This repository has been archived by the owner on Nov 26, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 10
/
Copy pathdownload_images.py
83 lines (74 loc) · 2.29 KB
/
download_images.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
# -*- coding: utf-8 -*-
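# Description: downloads image captures for a list of items, using the derivative (size) code given on the command line; the example below fetches "b", the smallest, cropped derivative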
# Example usage:
# python download_images.py ../data/captures.json ../img/items/ b
# Derivative types:
# t: 150
# b: 100 (cropped)
# f: 192
# r: 300
# w: 760
# q: 1600
# v: 2560
# g: Original
import json
import os
from PIL import Image
import sys
import urllib2
# input
# Three positional arguments are read below, so argv must hold the script
# name plus three values; otherwise print usage and stop instead of failing
# with an IndexError on the missing argument.
if len(sys.argv) < 4:
    print("Usage: %s <inputfile item captures json> <outputdir for images> <derivative code>" % sys.argv[0])
    sys.exit(1)
INPUT_FILE = sys.argv[1]       # path to the JSON file of capture ids
OUTPUT_DIR = sys.argv[2]       # directory the images are written to
DERIVATIVE_CODE = sys.argv[3]  # derivative (size) code appended to the image URL
# config
# When True, every capture is downloaded again even if a valid file exists.
overwriteExisting = False
imageExt = "jpg"
# "%s" is filled in with the capture id; the derivative code selects the size.
imageURLPattern = "http://images.nypl.org/index.php?id=%s&t=" + DERIVATIVE_CODE

# Placeholder until the capture list is loaded from the input file.
items = []

# Running tallies: count tracks progress through the list, the other three
# feed the summary printed at the end of the run.
count = successCount = skipCount = failCount = 0
def isValidImage(fileName):
    """Return True if PIL can open fileName as an image, False otherwise.

    fileName: path of the file to check.

    Any ordinary failure (missing or unreadable file, content PIL does not
    recognise as an image) yields False instead of raising.
    """
    try:
        # Open the file ourselves so the handle is closed as soon as the
        # check is done, rather than leaking one handle per checked file.
        with open(fileName, 'rb') as imageFile:
            # NOTE(review): per the PIL docs Image.open only identifies the
            # file and does not decode pixel data, so a truncated image may
            # still pass this check -- confirm whether that is acceptable.
            Image.open(imageFile)
    except Exception:
        # Exception rather than a bare except, so KeyboardInterrupt and
        # SystemExit still propagate instead of reading as "invalid image".
        return False
    return True
# Load the captures to fetch.  Iterating the parsed JSON must yield one
# capture id per entry (presumably a list of id strings -- verify against
# the data file).
with open(INPUT_FILE) as data_file:
    items = json.load(data_file)
itemCount = len(items)
print("Downloading " + str(itemCount) + " captures...")

# count is 1-based so the progress line reads "1." for the first capture and
# reaches 100% on the last one.
for count, captureId in enumerate(items, 1):
    imageURL = imageURLPattern % captureId
    # os.path.join keeps the file inside OUTPUT_DIR whether or not the
    # directory argument was given with a trailing separator.
    fileName = os.path.join(OUTPUT_DIR, "%s.%s" % (captureId, imageExt))

    # Skip captures that already have a valid image on disk, unless
    # overwriting was requested.
    if not overwriteExisting and os.path.isfile(fileName) and isValidImage(fileName):
        skipCount += 1
        # print(str(count) + ". Skipped " + imageURL)
        continue

    try:
        # Fetch the whole image before touching the output file, so a failed
        # download does not leave an empty file behind.
        response = urllib2.urlopen(imageURL)
        try:
            imageData = response.read()
        finally:
            response.close()
        with open(fileName, 'wb') as f:
            f.write(imageData)
    except urllib2.URLError as e:
        # Network/HTTP failure for this capture: record it and carry on.
        failCount += 1
        print("%s. URL error: %s %s" % (count, imageURL, e.args))
    except Exception:
        # Anything else is unexpected: report which capture triggered it,
        # then re-raise so the run stops with the original traceback.
        failCount += 1
        print("%s. Unexpected error: %s %s" % (count, imageURL, sys.exc_info()[0]))
        raise
    else:
        successCount += 1
        print(str(count) + ". Downloaded " + imageURL + " (" + str(round(1.0 * count / itemCount * 100, 3)) + "%)")

print("Downloaded " + str(successCount) + " images.")
print("Skipped " + str(skipCount) + " images.")
print("Failed to download " + str(failCount) + " images.")