# scraping.py -- forked from lukas-blecher/LaTeX-OCR
import os
import sys
import random
from tqdm import tqdm
import html
import requests
import re
import tempfile
try:
from arxiv import *
from extract_latex import *
except:
from dataset.arxiv import *
from dataset.extract_latex import *
# Captures the target of every `href="/wiki/..."` attribute (non-greedy, up to
# the closing double quote).
wikilinks = re.compile(r'href="/wiki/(.*?)"')
# Matches a whole <script>...</script> or <noscript>...</noscript> element;
# re.S lets `.` span newlines. The tag name must be followed directly by `>`,
# so opening tags that carry attributes are not matched.
htmltags = re.compile(r'<(noscript|script)>.*?<\/\1>', re.S)
# Prefix that is prepended to a page title to build the full article URL.
wiki_base = 'https://en.wikipedia.org/wiki/'
def parse_url(url, encoding=None):
    """Download a page and return its text with script blocks removed.

    Args:
        url: Address to fetch with an HTTP GET request.
        encoding: Optional text encoding. When given it replaces the encoding
            of the response before the body is decoded.

    Returns:
        The response body with every match of ``htmltags`` removed and HTML
        entities unescaped, or an empty string when the response status is
        not OK.
    """
    r = requests.get(url)
    if not r.ok:
        # Return an empty document instead of an implicit None so callers can
        # keep running regexes over the result of a failed request.
        return ''
    if encoding:
        r.encoding = encoding
    return html.unescape(htmltags.sub('', r.text))
def parse_wiki(url):
    """Extract the math and the outgoing wiki links of a single page.

    Args:
        url: Full address of the page to download.

    Returns:
        A tuple ``(math, linked)``. ``math`` is the result of
        ``find_math(text, wiki=True)`` and ``linked`` is a list of the unique
        ``/wiki/`` link targets that contain no ``:``. Both are empty lists
        when the page could not be downloaded.
    """
    text = parse_url(url)
    if not text:
        # A failed request yields no text; report "no math, no links" instead
        # of letting the regex below raise TypeError and abort the crawl.
        return [], []
    # Targets containing ':' are dropped -- presumably to skip namespaced
    # pages such as "File:" or "Category:" (TODO confirm).
    linked = list({target for target in wikilinks.findall(text) if ':' not in target})
    return find_math(text, wiki=True), linked
# recursive search
def recursive_search(parser, seeds, depth=2, skip=None, unit='links', base_url=None):
    """Crawl links round by round and collect the math found on the way.

    Every round takes a shuffled snapshot of all links known so far and
    visits those that have not been visited yet. Links discovered on a page
    are only added to the pool when that page produced math, so branches
    without math are not followed any further.

    Args:
        parser: Callable that takes one address and returns a tuple
            ``(math, links)`` for that page.
        seeds: Iterable of links to start from.
        depth: Number of rounds to run; converted with ``int``.
        skip: Optional iterable of links that count as already visited.
        unit: Unit label shown by the progress bar.
        base_url: Optional prefix that is prepended to every link before it
            is handed to ``parser``.

    Returns:
        A tuple ``(visited, math)`` of two lists: all visited links
        (including ``skip``) and the collected math with duplicates removed.
        Both are built from sets, so their order is arbitrary. When the
        search is interrupted with Ctrl-C the partial results are returned.

    Raises:
        Exception: Any error raised by ``parser`` propagates to the caller.
    """
    visited = set() if skip is None else set(skip)
    links = set(seeds)
    math = []
    try:
        for _ in range(int(depth)):
            link_list = list(links)
            random.shuffle(link_list)
            t_bar = tqdm(link_list, initial=len(visited), unit=unit)
            for link in t_bar:
                if link in visited:
                    continue
                t_bar.set_description('searching %s' % (link))
                m, l = parser(base_url + link if base_url else link)
                # check if we got any math from this page and
                # if not terminate the tree
                if len(m) > 0:
                    links.update(l)
                    # Keep the progress bar in sync with the growing pool.
                    t_bar.total = len(links)
                    math.extend(m)
                visited.add(link)
    except KeyboardInterrupt:
        # Deliberate best-effort: keep what has been gathered so far.
        pass
    return list(visited), list(set(math))
# recursive wiki search
def recursive_wiki(seeds, depth=4, skip=None):
    """Recursively search Wikipedia for math.

    Every link found on the pages in ``seeds`` is visited in the next round
    and so on, until a child page contains no math anymore. This is repeated
    ``depth`` times.

    Args:
        seeds: Iterable of article URLs (or bare page titles) to start from.
            Only the last path component of each entry is used as the title.
        depth: Number of rounds to run.
        skip: Optional iterable of page titles that count as already visited.

    Returns:
        The ``(visited, math)`` tuple produced by ``recursive_search``.
    """
    # Drop trailing slashes first so "…/wiki/Physics/" maps to "Physics"
    # rather than to an empty title.
    titles = [s.rstrip('/').split('/')[-1] for s in seeds]
    skip = [] if skip is None else skip
    return recursive_search(parse_wiki, titles, depth, skip, base_url=wiki_base, unit='links')
if __name__ == '__main__':
    # The first command line argument, when present, is the only seed page.
    # sys.argv[0] is the script itself, so one argument means len == 2.
    if len(sys.argv) > 1:
        url = [sys.argv[1]]
    else:
        url = ['https://en.wikipedia.org/wiki/Mathematics', 'https://en.wikipedia.org/wiki/Physics']
    # Create the output directory before crawling so a missing folder cannot
    # discard the results of a long search.
    out_dir = os.path.join(sys.path[0], 'dataset', 'data')
    os.makedirs(out_dir, exist_ok=True)
    visited, math = recursive_wiki(url)
    # Append one entry per line to the matching output file.
    for entries, name in zip([visited, math], ['visited_wiki.txt', 'math_wiki.txt']):
        with open(os.path.join(out_dir, name), 'a', encoding='utf-8') as f:
            f.writelines(entry + '\n' for entry in entries)