This repository was archived by the owner on Dec 23, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathunodc.py
57 lines (49 loc) · 1.79 KB
/
unodc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
import messytables
import xypath
import dl
import re
import orm
"""Value: dsID, region, indID, period, value, source, is_number
DataSet: dsID, last_updated, last_scraped, name
Indicator: indID, name, units
"""
SHEET_URL = "http://www.unodc.org/documents/data-and-analysis/statistics/crime/CTS_Sexual_violence.xls"
REGION_HEADER_VALUE = "Country/territory"
DSID = "unodc"
INDID = "TT029"
IND_NAME = "Total sexual violence at the national level, number of police-recorded offences"
def save_dataset():
dataset = {"dsID": DSID,
"last_updated": None,
"last_scraped": orm.now(),
"name": "United Nations Office on Drugs and Crime"}
orm.DataSet(**dataset).save()
def save_indicator():
indicator = {"indID": INDID,
"name": IND_NAME,
"units": "count"}
orm.Indicator(**indicator).save()
def main():
save_dataset()
save_indicator()
raw = dl.grab(SHEET_URL)
mtables = messytables.any.any_tableset(raw)
table = xypath.Table.from_messy(mtables.tables[0])
table.filter(IND_NAME).assert_one() # we have the right table
region_header = table.filter(REGION_HEADER_VALUE).assert_one()
regions = region_header.fill(xypath.DOWN)
years = region_header.fill(xypath.RIGHT, stop_before=lambda c: c.value == '')
assert len(years)<15 # left side.
for region_cell, year_cell, value_cell in regions.junction(years):
value = {"dsID": DSID,
'region': region_cell.value,
'indID': INDID,
'source': SHEET_URL,
'is_number': True,
'period': year_cell.value,
'value': value_cell.value
}
orm.Value(**value).save()
orm.session.commit()
if __name__=="__main__":
main()