forked from h2oai/h2o-3
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy paths3sync.gradle
144 lines (125 loc) · 4.88 KB
/
s3sync.gradle
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
//
// Supports dataset synchronization with S3
//
// Build-time classpath of this script itself (not of the projects that apply it).
buildscript {
    // Both script dependencies are resolved from Maven Central.
    repositories { mavenCentral() }

    dependencies {
        // Provides the Download task/plugin used for fetching files.
        classpath "de.undercouch:gradle-download-task:3.2.0"
        // Provides ISODateTimeFormat, imported below for timestamp parsing.
        classpath "joda-time:joda-time:2.3"
    }
}
// Apply the gradle-download-task plugin: it shows download progress, which is nice for large file downloads.
// The plugin is referenced by its class rather than by an id; see
// http://mrhaki.blogspot.com/2015/10/gradle-goodness-apply-external-script.html for why this is needed here.
apply plugin: de.undercouch.gradle.tasks.download.DownloadTaskPlugin
import org.apache.tools.ant.taskdefs.condition.Os
import de.undercouch.gradle.tasks.download.Download
import org.joda.time.format.ISODateTimeFormat
import java.util.regex.Matcher
// "Datasets" task: when executed, calls syncData for the 'smalldata' sub-directory.
task syncSmalldata {
    description = "Downloads small test data sets from s3 bucket to a local directory. Data set are small test cases for basic testing."
    group = "Datasets"
    doLast { syncData('smalldata') }
}
// "Datasets" task: when executed, calls syncData for the 'bigdata/laptop' sub-directory.
task syncBigdataLaptop {
    description = "Downloads large test data sets from s3 bucket to a local directory. Data sets are big enough to stress a laptop, but not require a cluster."
    group = "Datasets"
    doLast { syncData('bigdata/laptop') }
}
// "Datasets" task: when executed, calls syncData for the 'bigdata/server' sub-directory.
task syncBigdataServer {
    description = "Downloads large test data sets from s3 bucket to a local directory. Data sets are big enough to stress a cluster."
    group = "Datasets"
    doLast { syncData('bigdata/server') }
}
// "Datasets" task: when executed, calls syncData for the 'fread' sub-directory.
task syncFread {
    description = "Downloads fread datasets."
    group = "Datasets"
    doLast { syncData('fread') }
}
// Synchronizes one sub-directory of the h2o-public-test-data S3 bucket into the
// directory of the same name under the root project directory.
//
// A remote object is (re-)downloaded unless a local file with the same relative
// path, the same size and the same last-modified time already exists; md5 tags
// are currently ignored. Zero-byte remote objects are skipped. After each
// download the local file's modification time is set to the server's value so
// that the next sync recognises the file as up to date.
//
// subdir - key prefix inside the bucket, '/'-separated, without a trailing
//          slash (e.g. 'smalldata' or 'bigdata/laptop').
// Throws whatever the listing request or a download throws; a failed download
// is logged and re-thrown, aborting the sync.
def syncData(subdir) {
    def bucketUrl = 'https://h2o-public-test-data.s3.amazonaws.com/'
    // S3 keys always use '/', independent of the local file separator.
    def keyPrefix = subdir + '/'
    def localDestDir = (new File("$rootDir")).getCanonicalPath() + File.separator +
            subdir.replaceAll("/", Matcher.quoteReplacement(File.separator)) + File.separator
    // ISODateTimeFormat is a static factory class; it is not meant to be instantiated.
    def dateFormatter = ISODateTimeFormat.dateTime()

    // Maps a bucket key below keyPrefix to the absolute local destination path.
    def toLocalPath = { String key ->
        localDestDir + key.substring(keyPrefix.length()).replaceAll("/", Matcher.quoteReplacement(File.separator))
    }

    // Entries to download: [key: bucket key, localPath: destination, lastModified: server time in millis]
    def downloadList = []

    boolean needMoreData = true
    String lastKey = ""
    while (needMoreData) {
        // Let S3 filter by prefix and continue after the last key of the previous page.
        // Both values are URL-encoded because keys may contain reserved characters.
        def listUrl = bucketUrl + '?prefix=' + URLEncoder.encode(keyPrefix, 'UTF-8') +
                '&marker=' + URLEncoder.encode(lastKey, 'UTF-8')
        def remoteFiles = new XmlSlurper().parseText(new URL(listUrl).getText('UTF-8'))
        def contents = remoteFiles.Contents
        if (contents.isEmpty()) {
            break
        }
        needMoreData = remoteFiles.IsTruncated.text() == "true"

        contents.each { rfile ->
            def key = rfile.Key.text()
            // Entries arrive in listing order, so after the loop this holds the page's last key.
            lastKey = key

            // Defensive: only keys that really start with the prefix can be mapped to a local path.
            if (!key.startsWith(keyPrefix)) {
                return
            }
            def rsize = rfile.Size.text().toLong()
            // Some software makes 0-byte "directory" files in S3, skip them.
            if (rsize == 0) {
                return
            }
            def rdate = dateFormatter.parseDateTime(rfile.LastModified.text()).getMillis()

            // One direct look-up per remote object instead of scanning all local files.
            def localFile = new File(toLocalPath(key))
            def upToDate = localFile.isFile() && localFile.length() == rsize && localFile.lastModified() == rdate
            if (!upToDate) {
                downloadList.add([key: key, localPath: localFile.path, lastModified: rdate])
            }
        }
    }

    println("Going to download ${downloadList.size()} files...")
    downloadList.each { entry ->
        def fpath = bucketUrl + entry.key
        def localDestPath = entry.localPath
        try {
            download {
                src fpath
                dest localDestPath
                quiet false
            }
        } catch (Throwable e) {
            println "Failed to download file " + fpath
            e.printStackTrace()
            throw e
        }
        // Set downloaded file's last modified stamp to match the server, otherwise
        // the file would be considered stale on the next sync.
        File newFile = file(localDestPath)
        if (!newFile.setLastModified(entry.lastModified)) {
            println "Warning: could not set last modified time of " + localDestPath
        }
    }
}
// "RPackages" task: when executed, calls doRSync(), which runs the R package
// version check/update script.
task syncRPackages {
    group = "RPackages"
    // The script name must match the file that doRSync() actually runs, and each
    // concatenated fragment needs its own separating space.
    description = "Runs R script (h2o-r/scripts/package_version_check_update.R), which installs R packages from " +
            "http://s3.amazonaws.com/h2o-r. The purpose of this task is to ensure that the local system's R packages " +
            "match those used by H2O's Jenkins jobs."
    doLast {
        doRSync()
    }
}
// Returns the command line to execute for the current OS family: on Windows the
// given arguments are prefixed with 'cmd /c'; everywhere else they are returned as is.
def getOsSpecificCommandLine(args) {
    if (Os.isFamily(Os.FAMILY_WINDOWS)) {
        return ['cmd', '/c'] + args
    }
    return args
}
// Runs <rootDir>/h2o-r/scripts/package_version_check_update.R with R in "update" mode,
// forwarding this process' standard input to the R process.
def doRSync() {
    def scriptPath = [
            (new File("$rootDir")).getCanonicalPath(),
            "h2o-r",
            "scripts",
            "package_version_check_update.R"
    ].join(File.separator)
    def rCommand = ['R', "--vanilla", "-f", scriptPath, "--args", "update"]
    exec {
        standardInput = System.in
        commandLine getOsSpecificCommandLine(rCommand)
    }
}