forked from wikimedia/pywikibot
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfixing_redirects.py
executable file
·254 lines (213 loc) · 8.37 KB
/
fixing_redirects.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
#!/usr/bin/env python3
"""
Correct all redirect links in featured pages or only one page of each wiki.
Can be used with:
-always The bot won't ask for confirmation when putting a page
-featured Run over featured pages (for some Wikimedia wikis only)
-overwrite Usually only the link is changed ([[Foo]] -> [[Bar|Foo]]).
This parameters sets the script to completly overwrite the
link text ([[Foo]] -> [[Bar]]).
-ignoremoves Do not try to solve deleted pages after page move.
¶ms;
"""
#
# (C) Pywikibot team, 2004-2024
#
# Distributed under the terms of the MIT license.
#
from __future__ import annotations
import re
from concurrent.futures import ThreadPoolExecutor, as_completed
from contextlib import suppress
import pywikibot
from pywikibot import pagegenerators
from pywikibot.bot import (
AutomaticTWSummaryBot,
ExistingPageBot,
SingleSiteBot,
suggest_help,
)
from pywikibot.exceptions import (
CircularRedirectError,
InterwikiRedirectPageError,
InvalidPageError,
InvalidTitleError,
NoMoveTargetError,
SectionError,
)
from pywikibot.textlib import isDisabled
from pywikibot.tools import first_lower
from pywikibot.tools import first_upper as firstcap
# This is required for the text that is shown when you run this script
# with the parameter -help.
docuReplacements = {'¶ms;': pagegenerators.parameterHelp} # noqa: N816
# Featured articles categories
FEATURED_ARTICLES = 'Q4387444'
class FixingRedirectBot(SingleSiteBot, ExistingPageBot, AutomaticTWSummaryBot):

    """Run over pages and resolve redirect links."""

    # Do not treat pages that are themselves redirects.
    use_redirects = False
    ignore_save_related_errors = True
    ignore_server_errors = True
    summary_key = 'fixing_redirects-fixing'

    update_options = {
        'overwrite': False,
        'ignoremoves': False,
    }

    def replace_links(self, text, linked_page, target_page):
        """Replace all source links by target.

        :param text: wikitext to be worked on
        :param linked_page: the redirect page whose incoming links are
            rewritten
        :param target_page: the page the links should point to instead
        :return: the updated wikitext
        """
        mysite = pywikibot.Site()
        linktrail = mysite.linktrail()

        # Matches [[title#section|label]]trailing_chars
        link_regex = re.compile(
            r'\[\[(?P<title>[^\]\|#]*)(?P<section>#[^\]\|]*)?'
            r'(\|(?P<label>[^\]]*))?\]\](?P<linktrail>' + linktrail + ')')

        curpos = 0
        # This loop will run until we have finished the current page
        while True:
            m = link_regex.search(text, pos=curpos)
            if not m:
                break

            # Make sure that next time around we will not find this same hit.
            curpos = m.start() + 1

            try:
                is_interwikilink = mysite.isInterwikiLink(m['title'])
            except InvalidTitleError:
                continue  # skip invalid title

            # ignore interwiki links, links in the disabled area
            # and links to sections of the same page
            if (m['title'].strip() == ''
                    or is_interwikilink
                    or isDisabled(text, m.start())):
                continue

            actual_link_page = pywikibot.Page(target_page.site, m['title'])
            # Check whether the link found is to page.
            try:
                actual_link_page.title()
            except InvalidTitleError as e:
                pywikibot.error(e)
                continue
            if actual_link_page != linked_page:
                continue

            # The link looks like this:
            # [[page_title|link_text]]trailing_chars
            page_title = m['title']
            link_text = m['label']

            if not link_text:
                # or like this: [[page_title]]trailing_chars
                link_text = page_title

            section = m['section'] or ''
            if section and target_page.section():
                # Both the link and the redirect target carry a section
                # anchor; they cannot be merged safely, so keep the link.
                # (Fix: report the target's section, not the whole page.)
                pywikibot.warning(f'Source section {section} and target '
                                  f'section {target_page.section()} found. '
                                  f'Skipping.')
                continue

            trailing_chars = m['linktrail']
            if trailing_chars:
                link_text += trailing_chars

            # remove leading ":"
            if link_text[0] == ':':
                link_text = link_text[1:]

            # Preserve the original link's capitalization.
            if link_text[0].isupper() or link_text[0].isdigit():
                new_page_title = target_page.title()
            else:
                new_page_title = first_lower(target_page.title())

            # remove leading ":"
            if new_page_title[0] == ':':
                new_page_title = new_page_title[1:]

            if new_page_title == link_text and not section \
               or self.opt.overwrite:
                newlink = f'[[{new_page_title}]]'
            # check if we can create a link with trailing characters instead
            # of a pipelink
            elif (len(new_page_title) <= len(link_text)
                  and (firstcap(link_text[:len(new_page_title)])
                       == firstcap(new_page_title))
                  and re.sub(linktrail, '',
                             link_text[len(new_page_title):]) == ''
                  and not section):
                length = len(new_page_title)
                newlink = f'[[{link_text[:length]}]]{link_text[length:]}'
            else:
                newlink = f'[[{new_page_title}{section}|{link_text}]]'

            text = text[:m.start()] + newlink + text[m.end():]
        return text

    def get_target(self, page):
        """Get the target page for a given page.

        For a deleted page try the move target (unless -ignoremoves is
        set); for a redirect page resolve the redirect target.

        :param page: a page linked from the current page
        :return: tuple of *page* and its target, where the target is
            None if no replacement should be made
        """
        target = None
        if not page.exists():
            if not self.opt.ignoremoves:
                with suppress(NoMoveTargetError,
                              CircularRedirectError,
                              InvalidTitleError):
                    target = page.moved_target()
        elif page.isRedirectPage():
            try:
                target = page.getRedirectTarget(ignore_section=False)
            except (CircularRedirectError,
                    InvalidTitleError,
                    InterwikiRedirectPageError):
                pass
            except (RuntimeError, SectionError) as e:
                pywikibot.error(e)

        # Never retarget links from other namespaces to User (2) or
        # User talk (3) pages.
        if target is not None \
           and target.namespace() in [2, 3] and page.namespace() not in [2, 3]:
            target = None
        return page, target

    def treat_page(self) -> None:
        """Change all redirects from the current page to actual links."""
        try:
            newtext = self.current_page.text
        except InvalidPageError as e:
            pywikibot.error(e)
            return

        # Resolve the link targets concurrently; each lookup is a
        # network round trip, so threads overlap the waits.
        with ThreadPoolExecutor() as executor:
            futures = {executor.submit(self.get_target, p)
                       for p in self.current_page.linkedPages()}
            for future in as_completed(futures):
                page, target = future.result()
                if target:
                    newtext = self.replace_links(newtext, page, target)
        self.put_current(newtext)
def main(*args: str) -> None:
    """Process command line arguments and invoke bot.

    If args is an empty list, sys.argv is used.

    :param args: command line arguments
    """
    options = {}
    run_featured = False
    unknown = []
    gen = None

    # Strip global options first, then generator options.
    gen_factory = pagegenerators.GeneratorFactory()
    remaining = gen_factory.handle_args(pywikibot.handle_args(args))

    for arg in remaining:
        if arg == '-featured':
            run_featured = True
        elif arg in ('-always', '-ignoremoves', '-overwrite'):
            options[arg[1:]] = True
        else:
            unknown.append(arg)
    suggest_help(unknown_parameters=unknown)

    site = pywikibot.Site()
    if site.sitename == 'wikipedia:nl':
        pywikibot.info(
            '<<lightred>>There is consensus on the Dutch Wikipedia that '
            'bots should not be used to fix redirects.')
        return

    if run_featured:
        repo_page = site.page_from_repository(FEATURED_ARTICLES)
        if repo_page is not None:
            gen = repo_page.articles(namespaces=0, content=True)
        if not gen:
            suggest_help(
                unknown_parameters=['-featured'],
                additional_text='Option is not available for this site.')
            return
    else:
        gen = gen_factory.getCombinedGenerator(preload=True)

    FixingRedirectBot(generator=gen, **options).run()
# Script entry point: only run the bot when executed directly.
if __name__ == '__main__':
    main()