-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathextractor.py
executable file
·33 lines (27 loc) · 1.33 KB
/
extractor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
#! /usr/bin/python
import re
import os
# Gallery links have the form https://exhentai.org/g/<digits>/<10 chars of 0-9a-z>/.
# The dot in the host name is escaped so that it matches only a literal "."
# (an unescaped "." would accept any character in that position).
_GALLERY_URL_PATTERN = re.compile(
    r'https://exhentai\.org/g/[0-9]+/[0-9a-z]{10}/')


def parse(document_name):
    """Return all gallery URLs found in a saved HTML document.

    Args:
        document_name: Path of the text file to scan.

    Returns:
        A list of the matched URL strings, in the order they occur in the
        file. Repeated URLs are kept; the list is empty when nothing
        matches.

    Raises:
        IOError/OSError: If the file cannot be opened or read.
    """
    with open(document_name, 'r') as html_file:
        file_text = html_file.read()
    # The file is closed before matching; only the text is needed from here.
    return _GALLERY_URL_PATTERN.findall(file_text)
# Saved page sources to scan. The first favorites page has no page suffix;
# the remaining pages are numbered through the "page={}" placeholder.
_FIRST_PAGE_DOCUMENT = "/Users/christophorus/Downloads/php/view-source_https___exhentai.org_favorites.php.html"
_NUMBERED_PAGE_TEMPLATE = "/Users/christophorus/Downloads/php/view-source_https___exhentai.org_favorites.php_page={}.html"
_OUTPUT_FILE = "urls.txt"


def _write_document_urls(document_name, result_file):
    """Parse one saved page and append its gallery URLs to result_file.

    Does nothing (and prints nothing) when document_name is not an existing
    file, so missing pages are skipped silently.

    Args:
        document_name: Path of the saved HTML document.
        result_file: Writable text file object receiving the URLs.
    """
    if not os.path.isfile(document_name):
        return
    print("Now parsing document " + document_name)
    result_url_list = parse(document_name)
    # Only every second match (indices 1, 3, 5, ...) is written.
    # NOTE(review): presumably each gallery link occurs twice in a row in
    # the page source and this drops the duplicate -- confirm.
    for url in result_url_list[1::2]:
        # Each URL is written with a leading space and no newline, so the
        # output file is one space-separated line.
        result_file.write(" " + url)
    print("Document " + document_name + " has been parsed!")


def main():
    """Collect gallery URLs from the saved favorites pages into urls.txt.

    The output file is opened for writing first, so any previous content is
    discarded even when no input document exists. The un-numbered first page
    is processed first, then pages 1 through 99.
    """
    with open(_OUTPUT_FILE, 'w') as result_file:
        _write_document_urls(_FIRST_PAGE_DOCUMENT, result_file)
        for index in range(1, 100):
            _write_document_urls(
                _NUMBERED_PAGE_TEMPLATE.format(index), result_file)
    print("All documents are parsed!!!")


if __name__ == "__main__":
    main()