This repository has been archived by the owner on Nov 26, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 10
/
Copy pathget_genres.py
117 lines (103 loc) · 3.17 KB
/
get_genres.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
# -*- coding: utf-8 -*-
# Description: retrieves the genre of items; attempts to normalize genre text
# Example usage:
# python get_genres.py ../data/src/pd_items.json ../data/genres.json ../data/item_genres.json
from collections import Counter
import json
from pprint import pprint
import re
import sys
import urllib
# input
# Three positional arguments are read below, so argv must hold at least four
# entries (the script name plus the three paths). Checking for fewer than
# three let a two-argument call through and crashed on sys.argv[3].
if len(sys.argv) < 4:
    print("Usage: %s <inputfile items json> <outputfile genres json> <outputfile item genres json>" % sys.argv[0])
    sys.exit(1)
INPUT_FILE = sys.argv[1]         # items, one JSON document per line
OUTPUT_FILE = sys.argv[2]        # destination for the genre list
OUTPUT_ITEMS_FILE = sys.argv[3]  # destination for the per-item genre indices
# if string contains [x]: replace with [y]
# Keys are lowercase fragments tested with a plain substring check against the
# normalized genre text; the value is the canonical genre the text is folded
# into. Several keys are misspellings ('lithogrpah', 'litograph', 'lithgraph',
# 'engraing') that fold into the correctly spelled genre -- presumably variants
# seen in the source data.
# NOTE(review): the matching loop further down stops at the first key that
# hits and walks this dict in its own iteration order. When a text contains
# more than one key (e.g. both 'photograph' and 'print') the winner therefore
# depends on dict ordering, which is arbitrary on Python 2. Short keys such as
# 'pen', 'ink' and 'map' also match inside longer, unrelated words.
substrings = {
'lithograph': 'lithographs',
'lithogrpah': 'lithographs',
'litograph': 'lithographs',
'lithgraph': 'lithographs',
'engraving': 'engravings',
'engraing': 'engravings',
'engraver': 'engravings',
'print': 'prints',
'etching': 'etchings',
'drawing': 'drawings',
'pencil': 'drawings',
'watercolor': 'watercolors',
'gouache': 'watercolors',
'ink': 'drawings',
'photograph': 'photographs',
'book': 'books',
'monograph': 'books',
'atlas': 'atlases',
'painting': 'paintings',
'illustration': 'illustrations',
'map': 'maps',
'cartographic': 'maps',
'pen': 'drawings',
'manuscript': 'manuscripts',
'document': 'documents',
'scroll': 'scrolls',
'scores': 'sheet music',
'musical notation': 'sheet music'
}
# init
genres = []       # one dict per distinct genre, in first-seen order
item_genres = []  # for every processed item: the 'index' of its genre entry


def addGenre(g):
    """Record one item's genre in the module-level tallies.

    If a genre entry whose 'value' equals ``g`` already exists, its 'count'
    is incremented; otherwise a new entry is appended to ``genres`` with a
    count of 1. In both cases the entry's 'index' is appended to
    ``item_genres``, so that list gains exactly one element per call.

    Args:
        g: normalized genre text. A falsy value (e.g. the empty string) is
            stored with the label 'Unknown' and an empty search URL.

    Returns:
        None. The function works by mutating ``genres`` and ``item_genres``.
    """
    # Stop at the first match instead of building a full list on every call.
    entry = next((known for known in genres if known['value'] == g), None)
    if entry is not None:
        # Update the matched dict itself rather than going through
        # genres[entry['index']], which would hit the wrong entry if the
        # list has been re-ordered since the entry was created.
        entry['count'] += 1
    else:
        label = 'Unknown'
        url = ''
        if g:
            # urllib.quote only exists on Python 2; fall back to its
            # Python 3 location so the function works on either interpreter.
            try:
                from urllib import quote
            except ImportError:
                from urllib.parse import quote
            label = g.capitalize()
            url = 'http://digitalcollections.nypl.org/search/index?utf8=✓&keywords=&filters%5Brights%5D=pd&filters%5Bgenre%5D=' + quote(label)
        entry = {
            'index': len(genres),
            'value': g,
            'label': label,
            'url': url,
            'count': 1
        }
        genres.append(entry)
    item_genres.append(entry['index'])
# Process the input one JSON document per line. The `with` block guarantees
# the handle is closed, and iterating the file object avoids loading every
# line into memory up front.
with open(INPUT_FILE, 'r') as infile:
    for line in infile:
        # A blank line (e.g. a stray trailing newline) is not an item; skip it
        # rather than letting json.loads raise.
        if not line.strip():
            continue
        # Read line as json
        item = json.loads(line)
        # Retrieve genre; items without any genre entry are recorded as ""
        genre = ""
        if "genres" in item and len(item["genres"]) > 0:
            # Only the first listed genre is used
            text = item["genres"][0]["text"]
            # On Python 2 json.loads yields unicode, which is encoded to a
            # UTF-8 byte string exactly as before; on Python 3 the value is
            # already str and must not be turned into a "b'...'" repr.
            if not isinstance(text, str):
                text = text.encode("utf_8")
            # Make lowercase exclude everything after divider
            text = text.lower().split(" -- ")[0]
            # Remove non-ASCII chars
            text = re.sub(r'[^\x00-\x7F]+', '', text)
            # Trim string
            text = text.strip()
            # Fold onto a canonical genre: the first key found inside the
            # text wins; text matching no key is kept as-is
            for needle in substrings:
                if needle in text:
                    text = substrings[needle]
                    break
            genre = text
        addGenre(genre)
# Report on collections
# Order the tallies from most to least frequent and dump them to the console.
# After this sort a genre's position in the list no longer equals its 'index'
# field; item_genres stores 'index' values, so match on the field.
genres.sort(key=lambda entry: entry['count'], reverse=True)
pprint(genres)
# Write out data
# Both outputs are handled the same way: serialize as JSON, then report how
# many entries were written. The genre list goes first, then the per-item
# genre indices.
for path, payload, noun in (
    (OUTPUT_FILE, genres, " genres to "),
    (OUTPUT_ITEMS_FILE, item_genres, " items to "),
):
    with open(path, 'w') as outfile:
        json.dump(payload, outfile)
    print("Wrote " + str(len(payload)) + noun + path)