forked from volpino/wiki-network
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdownload_event_date.py
executable file
·80 lines (71 loc) · 2.81 KB
/
download_event_date.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
#!/usr/bin/env python
##########################################################################
# #
# This program is free software; you can redistribute it and/or modify #
# it under the terms of the GNU General Public License as published by #
# the Free Software Foundation; version 2 of the License. #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
# GNU General Public License for more details. #
# #
##########################################################################
import contextlib
import csv
import logging
import re
import sys
import urllib

import simplejson

try:  # Python 3
    from urllib.parse import urlencode
    from urllib.request import urlopen
except ImportError:  # Python 2
    from urllib import urlencode, urlopen
# Patterns are compiled once instead of being looked up for every line.
# Raw strings keep the original regex semantics while avoiding the
# invalid-escape warnings that "\s" / "\d" / "\=" trigger in plain literals.
_DATE_FIELD_RE = re.compile(r"date\s+?\=(.*)")
_YEAR_RE = re.compile(r"(\d{4})")


def _extract_years(content):
    """Return the sorted, de-duplicated years found on ``date =`` lines."""
    years = set()
    for line in content.split("\n"):
        field = _DATE_FIELD_RE.search(line)
        if not field:
            continue
        # Only the first four-digit run of a matching line is kept.
        year = _YEAR_RE.search(field.group(0))
        if year:
            years.add(year.group(0))
    return sorted(years)


def get_event_date(title, csv_writer, lang):
    """Fetch a page's wikitext and write the years of its date fields.

    The current revision of ``title`` is requested from the ``lang``
    Wikipedia API.  Every line of the wikitext containing ``date`` followed
    by whitespace and ``=`` is scanned for a four-digit number, and one row
    ``[title, "<year> <year> ..."]`` is written to ``csv_writer`` for each
    page in the response.  Years are unique and sorted so the output is
    reproducible between runs.

    A page without revisions (for example a missing title) is logged and
    written with an empty year column instead of raising ``KeyError``, so a
    single bad title does not abort a whole batch.

    :param title: page title to look up
    :param csv_writer: object exposing ``writerow(list)``
    :param lang: Wikipedia language code used as the host prefix
    """
    api_base = 'http://%s.wikipedia.org/w/api.php' % lang
    options = {
        'action': 'query',
        'prop': 'revisions',
        'titles': title,
        'rvprop': 'content',
        'format': 'json'
    }
    url = api_base + '?' + urlencode(options)
    logging.info(url)

    # closing() releases the connection even if JSON decoding fails.
    with contextlib.closing(urlopen(url)) as response:
        payload = simplejson.load(response)

    # An error payload has no "query"/"pages" keys; treat it as no pages.
    pages = payload.get("query", {}).get("pages", {})
    if not pages:
        logging.warning("No pages returned for %r", title)

    # NOTE(review): one row per returned page is assumed here -- confirm
    # against the upstream file that writerow belongs inside this loop.
    for page_id in pages:
        revisions = pages[page_id].get("revisions")
        if revisions:
            years = _extract_years(revisions[0]["*"])
        else:
            logging.warning("No revisions for %r (page id %s)",
                            title, page_id)
            years = []
        csv_writer.writerow([title, " ".join(years)])
def main():
    """Command-line entry point.

    Expects two positional arguments: an input CSV file whose first column
    holds page titles, and the path of the tab-separated output file.  For
    every title, ``get_event_date`` is called with the output writer and the
    language chosen with ``-l/--lang`` (default ``en``).

    The ``-c/--clean`` flag is still accepted for backward compatibility;
    nothing in this function reads it.
    """
    import optparse

    # The first positional argument is opened as a CSV file, so the usage
    # text names it as a file rather than a single page title.
    p = optparse.OptionParser(
        usage="usage: %prog [options] input_file output_file")
    p.add_option('-l', '--lang', action="store", dest="lang", default="en",
                 help="Wikipedia language")
    p.add_option('-c', '--clean', action="store_true", dest="clean",
                 help="Clean wiki syntax / HTML")
    opts, files = p.parse_args()
    if len(files) != 2:
        p.error("Wrong parameters")

    logging.basicConfig(stream=sys.stderr,
                        level=logging.DEBUG,
                        format='%(asctime)s %(levelname)s %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S')

    # Context managers guarantee both files are closed (and the output is
    # flushed) even when a lookup raises part-way through the batch.
    with open(files[0], "r") as source, open(files[1], "w") as target:
        csv_reader = csv.reader(source)
        csv_writer = csv.writer(
            target,
            delimiter="\t",
            quotechar='"',
            quoting=csv.QUOTE_ALL
        )
        for row in csv_reader:
            # Blank lines yield empty rows; skip them and empty titles
            # instead of failing on row[0] or querying an empty title.
            if not row or not row[0].strip():
                continue
            get_event_date(row[0], csv_writer, opts.lang)
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()