"""
Regenerate the "backlinks" section on each page.
Should be used after you made modifications to any page that either changed the title or changed any links inside the page.
Needs beautifulsoup, e.g. `pip install beautifulsoup4`
Slow, will take ~3-5mins!
"""
import os
import re

from bs4 import BeautifulSoup


def get_html_files():
    """Return the names of all HTML files in the pages/ folder."""
    return [f for f in os.listdir(src_folder_path()) if f.endswith(".html")]


def folder_path():
    """Return the directory this script lives in."""
    return os.path.dirname(os.path.realpath(__file__))


def src_folder_path():
    """Return the directory containing the HTML pages."""
    return os.path.join(folder_path(), "pages/")


def hneedle(t):
    """Match the page's main heading inside the top-most section."""
    return t.name in ['h1', 'h2', 'h3', 'h4'] and t.find_parent('div', class_="section top-most")
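
# hneedle targets a heading inside the page's top-most section, shaped
# roughly like this (an illustrative sketch, not copied from a real page):
#
#   <div class="section top-most">
#     <h1>Page Title</h1>
#     ...
#   </div>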


def bneedle(t):
    """Match an existing backlinks section: a div.section whose <h3> reads 'Backlinks'."""
    if t.name == 'div' and 'section' in t.get('class', []):
        h3 = t.find('h3')
        if h3 and h3.text == 'Backlinks':
            return True
    return False
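
# bneedle targets the section this script itself writes out, shaped roughly
# like this (illustrative):
#
#   <div class="section">
#     <h3>Backlinks</h3>
#     <a href="other-page.html">Other Page</a>, ...
#   </div>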


def run():
    files = get_html_files()
    # Maps a target file name to the (source file, source title) pairs linking to it.
    bl = {}
    flen = len(files)
    for ix, f in enumerate(files):
        if (ix + 1) % 10 == 0:
            print(f"Collecting links ... [{ix + 1}/{flen}]")
        if f in ["index.html", "Glossary.html", "unknown_node.html", "setf-classname.html"]:
            continue
        fpath = os.path.join(src_folder_path(), f)
        # The page title, used later as the backlink text.
        title = None
        with open(fpath, 'r') as r:
            html = r.read()
        soup = BeautifulSoup(html, 'html.parser')
        header = soup.find(hneedle)
        if header.string is None:
            # The heading contains nested markup; take its inner HTML and
            # strip <b>/<i> tags. (decode_contents returns a str, unlike
            # encode_contents, which returns bytes.)
            title = re.sub("</?[bi]>", "", header.decode_contents()).strip()
        else:
            title = header.string.strip()
        assert title is not None
        # Normalize all whitespace (newlines, non-breaking spaces, runs of
        # spaces) to single spaces.
        title = " ".join(title.split())
        backlinks = soup.find(bneedle)
        # Remove the existing backlinks section and the nav so their anchors
        # are not counted as outgoing links.
        if backlinks:
            backlinks.decompose()
        nav = soup.find("div", {"class": "nav"})
        if nav:
            nav.decompose()
        body_main = soup.find("div", {"class": "body__main__inner"})
        if body_main is None:
            print(f"No body__main__inner in {fpath}, skipping")
            continue
        anchors = [a['href'] for a in body_main.find_all('a', href=True)]
        for href in anchors:
            # Strip any fragment, e.g. "page.html#section" -> "page.html".
            if "#" in href:
                href = href[:href.index('#')]
            bl.setdefault(href, []).append((f, title))
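
    # At this point bl maps each target page to its inbound links, e.g. (with
    # hypothetical file names, for illustration only):
    #   {"foo.html": [("bar.html", "Bar Title"), ("baz.html", "Baz Title")]}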
    for ix, f in enumerate(files):
        if (ix + 1) % 10 == 0:
            print(f"Inserting links ... [{ix + 1}/{flen}]")
        if f in ["index.html", "Glossary.html"]:
            continue
        fpath = os.path.join(src_folder_path(), f)
        if f not in bl:
            continue
        bls = bl[f]
        if not bls:
            continue
html = ""
with open(fpath, 'r') as r:
html = r.read()
soup = BeautifulSoup(html, 'html.parser')
bl_list = [f"<a href=\"{b[0]}\">{b[1]}</a>, " for b in set(bls) if b[0] != f]
bl_list = "".join(sorted(bl_list, key = lambda b: b[b.index(">"):].lower()))
bl_list = bl_list[:-2]
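        # bl_list is now a single HTML fragment, e.g. (hypothetical pages):
        #   <a href="bar.html">Bar Title</a>, <a href="baz.html">Baz Title</a>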
        blsection = soup.find(bneedle)
        blhtml = BeautifulSoup(f"""<h3>Backlinks</h3>
        {bl_list} """, "html.parser")
        if blsection is None:
            # No backlinks section yet: create one at the end of the page body.
            new_section = soup.new_tag("div", **{"class": "section"})
            new_section.append(blhtml)
            body_main = soup.find("div", {"class": "body__main__inner"})
            body_main.append(new_section)
        else:
            # Reuse the existing section, replacing its contents.
            blsection.clear()
            blsection.append(blhtml)
        html = str(soup)
        with open(fpath, "w") as o:
            o.write(html)


if __name__ == "__main__":
    run()