Skip to content

Commit 67572bf

Browse files
committed
UTILS: script
1 parent 512a813 commit 67572bf

File tree

1 file changed

+203
-0
lines changed

1 file changed

+203
-0
lines changed

_websiteutils/make_redirects_links.py

Lines changed: 203 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,203 @@
1+
import argparse
2+
import glob
3+
import logging
4+
import multiprocessing
5+
import os
6+
import pathlib
7+
import re
8+
import subprocess
9+
import sys
10+
import tempfile
11+
12+
13+
"""
14+
This script does two things that improve the website organization.
15+
16+
First, we used to host in the root of the webpage, but have now moved to
17+
``/stable/``. We do not want obsolete links to link to nothing (or that has
18+
been our policy), so we currently just keep the old version at the top level.
19+
Here, instead, we either softlink to the newest version, or replace the file by
20+
an html refresh redirect.
21+
22+
Second, it changes the canonical link in each html file to the newest version
23+
found of the html file (including stable if it's in the latest version).
24+
25+
This script takes a while, and is destructive, so should probably be run on a
26+
branch and pushed as a PR so it can easily be reverted.
27+
"""
28+
29+
_log = logging.getLogger('make_redirect_links')

# Directories searched for a copy of a page, in priority order: ``stable``
# first, then every ``major.minor.micro`` from 6.6.6 counting down to 0.0.0.
# NOTE(review): version components above 6 (e.g. 3.10.0) are not generated.
tocheck = ['stable'] + [f'{major}.{minor}.{micro}'
                        for major in range(6, -1, -1)
                        for minor in range(6, -1, -1)
                        for micro in range(6, -1, -1)]

# Top-level entries that are never redirected or linked: the versioned docs
# themselves plus other projects and site infrastructure.  The directory
# holding this script is ``_websiteutils``; the older ``_webpageutils``
# spelling is kept so that nothing previously ignored starts being processed.
toignore = tocheck + ['mpl-probscale', 'mpl_examples', 'mpl_toolkits',
                      '_webpageutils', '_websiteutils', 'xkcd',
                      'sitemap.xml', 'robots.txt', 'CNAME', '.git']

logging.basicConfig(level=logging.DEBUG)
42+
43+
44+
def findlast(fname, tocheck):
    """
    Look for ``fname`` inside each directory of ``tocheck``, in order.

    Returns the first directory of ``tocheck`` that contains ``fname``,
    or None when none of them does.
    """
    relative = pathlib.Path(fname)
    matches = (candidate for candidate in tocheck
               if pathlib.Path(candidate, relative).exists())
    return next(matches, None)
56+
57+
# Template for the stub page that replaces a moved html file.  It contains
# exactly three ``%s`` placeholders (meta refresh, canonical link, visible
# link); callers fill all three with the same site-absolute path, e.g.
# ``html_redirect % (path, path, path)`` with ``path = '/stable/index.html'``.
html_redirect = """
<!DOCTYPE HTML>
<html lang="en">
    <head>
        <meta charset="utf-8">
        <meta http-equiv="refresh" content="0;url=https://matplotlib.org%s" />
        <link rel="canonical" href="https://matplotlib.org%s" />
    </head>
    <body>
        <h1>
            The page has been moved
            <a href="https://matplotlib.org%s">here</a>.
        </h1>
    </body>
</html>
"""
72+
73+
74+
def do_links(root0):
    """
    Either soft link a file at the top level to its newest position,
    or make an html redirect if it is an html file.

    Every file below *root0* that also exists under one of the ``tocheck``
    directories is removed and replaced; files with no such copy are left
    untouched.

    Parameters
    ----------
    root0 : str
        Directory to process, given relative to the current working
        directory (the ``tocheck`` directories are looked up relative to
        the working directory as well).
    """
    _log.info(f'Doing links on {root0}')
    # os.walk already visits every subdirectory, so there is no explicit
    # recursion here: recursing on the bare names in ``dirs`` would resolve
    # them against the working directory rather than against ``root``.
    for root, dirs, files in os.walk(root0):
        for name in files:
            fullname = os.path.join(root, name)
            last = findlast(fullname, tocheck)
            _log.debug(f'Checking: {fullname} found {last}')
            if last is None:
                continue
            newest = os.path.join(last, fullname)
            # Remove before writing so that a symlink left by an earlier
            # run is replaced instead of being written through.
            os.remove(fullname)
            if name.endswith(('.htm', '.html')):
                # make an html redirect.
                _log.info(f'Rewriting HTML: {fullname} in {last}')
                # URLs always use forward slashes.
                oldname = '/' + newest.replace(os.sep, '/')
                with open(fullname, 'w', encoding='utf-8') as fout:
                    fout.write(html_redirect % (oldname, oldname, oldname))
            else:
                # soft link
                # The target must be relative to where the link lives, so
                # a link one level down becomes
                # `ln -s ../3.1.1/boo/who boo/who`.
                oldname = os.path.relpath(newest, root)
                _log.info(f'Linking {fullname} to {oldname}')
                os.symlink(oldname, fullname)
107+
108+
109+
def do_canonicals(dname):
    """
    For each html file in the versioned docs, make the canonical link point
    to the newest version.

    Parameters
    ----------
    dname : str
        Directory whose whole tree is walked.  The first path component of
        every file found is treated as the version directory and stripped
        before looking the file up in ``tocheck``.
        NOTE(review): this assumes *dname* is a single directory name such
        as ``3.1.1`` - confirm before passing a nested path.
    """
    _log.debug(f'Walking {dname}')
    # os.walk descends into every subdirectory itself; recursing as well
    # would rewrite each file once per ancestor directory.
    for root, dirs, files in os.walk(dname):
        for name in files:
            fullname = os.path.join(root, name)
            _log.debug(f'Checking {fullname}')
            if not name.endswith(('.htm', '.html')):
                continue
            p = pathlib.Path(fullname)
            # Path of the page without its leading version directory.
            basename = pathlib.Path(*p.parts[1:])
            last = findlast(basename, tocheck)
            if last is not None:
                update_canonical(fullname, last)

        for d in dirs:
            _log.info(f'DIR: {d}')
129+
130+
131+
def update_canonical(fullname, last):
    """
    Change the canonical link in *fullname* to the same link in the
    version given by *last*.  We do this with a regexp to prevent
    removing any other content on a line that has the canonical link.

    Note that if for some reason there are more than one canonical link
    this will change all of them.

    Parameters
    ----------
    fullname : str
        Path of the html file to rewrite in place.  Its first component is
        replaced by *last* to build the new canonical URL.
    last : str
        Directory name (for example ``stable``) the canonical link should
        point into.
    """
    p = pathlib.Path(fullname)
    pre = 'https://matplotlib.org/'
    pnew = pathlib.Path(last, *p.parts[1:])
    # as_posix keeps the URL forward-slashed whatever the path separator is.
    newcanon = pre + pnew.as_posix()
    _log.info(f'{p} to {newcanon}')
    new = bytes(f'<link rel="canonical" href="{newcanon}"',
                encoding='utf-8')
    # ``[^"]*`` stops at the quote closing the href; a greedy ``.*`` would
    # run to the last quote on the line and swallow whatever follows.
    pattern = re.compile(rb'<link rel="canonical" href="[^"]*"')

    # Write next to the target so that the final move never has to cross a
    # filesystem boundary.
    fout = tempfile.NamedTemporaryFile(delete=False, dir=p.parent)
    try:
        with fout, open(fullname, 'rb') as fin:
            for line in fin:
                if b'<link rel="canonical"' in line:
                    # A callable replacement is inserted verbatim (no
                    # backslash-escape processing of the new URL).
                    ll = pattern.sub(lambda match: new, line)
                    _log.debug(f'new {line}->{ll}')
                    fout.write(ll)
                else:
                    fout.write(line)
        # Temporary files are created private; keep the original mode.
        os.chmod(fout.name, os.stat(fullname).st_mode & 0o7777)
        os.replace(fout.name, fullname)
    except BaseException:
        # Do not leave a stray temporary file behind in the docs tree.
        if os.path.exists(fout.name):
            os.remove(fout.name)
        raise
158+
159+
160+
if __name__ == "__main__":
    # Command-line entry point; must be run from the root of the website
    # checkout because every path below is relative to the working directory.

    parser = argparse.ArgumentParser(
        description='Replace obsolete top-level pages by redirects or soft '
                    'links to the newest versioned docs, and update the '
                    'canonical links of the versioned html files.')

    parser.add_argument('--np', type=int, help='Number of processors to use')
    parser.add_argument('--no_canonicals', help='do not do canonical links',
                        action="store_true")
    parser.add_argument('--no_redirects', help='do not do redirects links',
                        action="store_true")

    args = parser.parse_args()
    # A missing ``--np`` (or ``--np 0``) selects the serial code path below.
    np = args.np or None

    # html redirect or soft link most things in the top-level directory that
    # are not other modules or versioned docs.
    if not args.no_redirects:
        # Take a snapshot of the listing (and close the iterator) before any
        # file is removed or created in the directory being listed.
        with os.scandir('./') as listing:
            entries = list(listing)
        for entry in entries:
            if entry.name in toignore:
                continue
            if entry.is_dir():
                do_links(entry.name)
            elif entry.name.endswith(('.htm', '.html')):
                fullname = entry.name
                last = findlast(fullname, tocheck)
                _log.debug(f'Checking: {fullname} found {last}')
                if last is not None:
                    os.remove('./'+fullname)
                    _log.info(f'Rewriting HTML: {fullname} in {last}')
                    with open(fullname, 'w', encoding='utf-8') as fout:
                        oldname = '/' + os.path.join(last, fullname)
                        st = html_redirect % (oldname, oldname, oldname)
                        fout.write(st)
        _log.info('Done links and redirects')

    # change the canonical url for all html to the newest version in the docs:
    if not args.no_canonicals:
        # ``tocheck[0]`` is 'stable'; only the numbered versions are walked.
        if np is not None:
            with multiprocessing.Pool(np) as pool:
                pool.map(do_canonicals, tocheck[1:])
        else:
            for t in tocheck[1:]:
                do_canonicals(t)

0 commit comments

Comments
 (0)