forked from apache/predictionio
Commit: Use pack for U2I training-test split wrapper
Showing 7 changed files with 175 additions and 143 deletions.
process/engines/commons/evaluations/scala/u2itrainingtestsplit/build.sbt
25 changes: 17 additions & 8 deletions
```diff
@@ -1,12 +1,21 @@
-import AssemblyKeys._
-
-assemblySettings
+import xerial.sbt.Pack._

 name := "predictionio-process-commons-evaluations-scala-u2itrainingtestsplittime"

-libraryDependencies += "com.twitter" %% "scalding-args" % "0.8.6"
+libraryDependencies ++= Seq(
+  "ch.qos.logback" % "logback-classic" % "1.1.1",
+  "ch.qos.logback" % "logback-core" % "1.1.1",
+  "com.github.scopt" %% "scopt" % "3.2.0",
+  "org.clapper" %% "grizzled-slf4j" % "1.0.1")

-excludedJars in assembly <<= (fullClasspath in assembly) map { cp =>
-  val excludes = Set("minlog-1.2.jar")
-  cp filter { jar => excludes(jar.data.getName) }
-}
+packSettings
+
+packJarNameConvention := "full"
+
+packExpandedClasspath := true
+
+packGenerateWindowsBatFile := false
+
+packMain := Map("u2itrainingtestsplit" -> "io.prediction.evaluations.commons.trainingtestsplit.U2ITrainingTestSplitTime")
+
+packJvmOpts := Map("u2itrainingtestsplit" -> Common.packCommonJvmOpts)
```
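The `packSettings` and `pack*` keys above come from the xerial sbt-pack plugin (imported via `xerial.sbt.Pack._`), which replaces sbt-assembly here: instead of producing one fat JAR, `sbt pack` collects dependency JARs under `target/pack/lib` and generates a launcher script in `target/pack/bin` for each entry in `packMain`. For these keys to resolve, the plugin must also be registered in `project/plugins.sbt`; a minimal sketch, assuming an sbt-pack 0.5.x release (the exact version is not shown in this diff):

```scala
// project/plugins.sbt -- registers the sbt-pack plugin that provides
// packSettings, packMain, packJvmOpts, etc. used in build.sbt above.
// The version below is an assumption; this commit does not show it.
addSbtPlugin("org.xerial.sbt" % "sbt-pack" % "0.5.1")
```

With `packMain` mapping "u2itrainingtestsplit" to the wrapper's main class, `sbt pack` would emit a `target/pack/bin/u2itrainingtestsplit` launcher (and no `.bat` file, since `packGenerateWindowsBatFile` is false).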
...mons/evaluations/scala/u2itrainingtestsplit/src/main/scala/U2ITrainingTestSplitTime.scala
150 changes: 150 additions & 0 deletions (new file)
```scala
package io.prediction.evaluations.commons.trainingtestsplit

import io.prediction.commons.filepath.U2ITrainingTestSplitFile

import java.io.File
import scala.io.Source
import scala.sys.process._

import grizzled.slf4j.Logger

case class U2ITrainingTestSplitTimeConfig(
  hadoop: String = "",
  pdioEvalJar: String = "",
  hdfsRoot: String = "",
  localTempRoot: String = "",
  appid: Int = 0,
  engineid: Int = 0,
  evalid: Int = 0,
  sequenceNum: Int = 0)

/**
 * Wrapper for the Scalding U2ITrainingTestSplitTime job.
 *
 * Args:
 * --hadoop <string> hadoop command
 * --pdioEvalJar <string> the name of the Scalding U2ITrainingTestSplit job JAR
 * --sequenceNum <int> the sequence number (starts from 1 for the 1st iteration and increments for later iterations)
 *
 * --dbType <string> appdata DB type
 * --dbName <string>
 * --dbHost <string> optional (e.g. "127.0.0.1")
 * --dbPort <int> optional (e.g. 27017)
 *
 * --training_dbType <string> training_appdata DB type
 * --training_dbName <string>
 * --training_dbHost <string> optional
 * --training_dbPort <int> optional
 *
 * --validation_dbType <string> validation_appdata DB type
 * --validation_dbName <string>
 * --validation_dbHost <string> optional
 * --validation_dbPort <int> optional
 *
 * --test_dbType <string> test_appdata DB type
 * --test_dbName <string>
 * --test_dbHost <string> optional
 * --test_dbPort <int> optional
 *
 * --hdfsRoot <string> root directory of the HDFS
 *
 * --appid <int>
 * --engineid <int>
 * --evalid <int>
 *
 * --itypes <strings separated by whitespace> e.g. "--itypes type1 type2". If --itypes is not specified, ALL itypes will be used.
 *
 * --trainingPercent <double> (0.01 to 1) training set percentage
 * --validationPercent <double> (0.01 to 1) validation set percentage
 * --testPercent <double> (0.01 to 1) test set percentage
 * --timeorder <boolean> requires total percentage < 1
 */
object U2ITrainingTestSplitTime {
  def main(args: Array[String]) {
    val parser = new scopt.OptionParser[U2ITrainingTestSplitTimeConfig]("u2itrainingtestsplit") {
      head("u2itrainingtestsplit")
      opt[String]("hadoop") required () action { (x, c) =>
        c.copy(hadoop = x)
      } text ("path to the 'hadoop' command")
      opt[String]("pdioEvalJar") required () action { (x, c) =>
        c.copy(pdioEvalJar = x)
      } text ("path to PredictionIO Hadoop job JAR")
      opt[String]("hdfsRoot") required () action { (x, c) =>
        c.copy(hdfsRoot = x)
      } text ("PredictionIO root path in HDFS")
      opt[String]("localTempRoot") required () action { (x, c) =>
        c.copy(localTempRoot = x)
      } text ("local directory for temporary storage")
      opt[Int]("appid") required () action { (x, c) =>
        c.copy(appid = x)
      } text ("the App ID of this offline evaluation")
      opt[Int]("engineid") required () action { (x, c) =>
        c.copy(engineid = x)
      } text ("the Engine ID of this offline evaluation")
      opt[Int]("evalid") required () action { (x, c) =>
        c.copy(evalid = x)
      } text ("the OfflineEval ID of this offline evaluation")
      opt[Int]("sequenceNum") required () action { (x, c) =>
        c.copy(sequenceNum = x)
      } validate { x =>
        if (x >= 1) success else failure("--sequenceNum must be >= 1")
      } text ("sequence (iteration) number of the offline evaluation")
    }
    val logger = Logger(U2ITrainingTestSplitTime.getClass)

    parser.parse(args, U2ITrainingTestSplitTimeConfig()) map { config =>
      val hadoop = config.hadoop
      val pdioEvalJar = config.pdioEvalJar
      val hdfsRoot = config.hdfsRoot
      val localTempRoot = config.localTempRoot
      val appid = config.appid
      val engineid = config.engineid
      val evalid = config.evalid
      val sequenceNum = config.sequenceNum
      // pass the original command-line arguments through to the Scalding jobs
      // (Array.toString would yield "[Ljava.lang.String;@..." and break the command)
      val argsString = args.mkString(" ")
      val resplit = sequenceNum > 1

      /** command */
      if (!resplit) {
        // prep
        val splitPrepCmd = hadoop + " jar " + pdioEvalJar + " io.prediction.evaluations.scalding.commons.u2itrainingtestsplit.U2ITrainingTestSplitTimePrep " + argsString
        executeCommandAndCheck(splitPrepCmd)
      }

      // copy the count to local tmp
      val hdfsCountPath = U2ITrainingTestSplitFile(hdfsRoot, appid, engineid, evalid, "u2iCount.tsv")
      val localCountPath = localTempRoot + "eval-" + evalid + "-u2iCount.tsv"
      val localCountFile = new File(localCountPath)

      // create parent dir
      localCountFile.getParentFile().mkdirs()

      // delete existing file first
      if (localCountFile.exists()) localCountFile.delete()

      // get the count from HDFS
      val getHdfsCountCmd = hadoop + " fs -getmerge " + hdfsCountPath + " " + localCountPath
      executeCommandAndCheck(getHdfsCountCmd)

      // read the local file and get the count
      val lines = Source.fromFile(localCountPath).getLines
      if (lines.isEmpty) throw new RuntimeException(s"Count file $localCountPath is empty")

      val count = lines.next

      // split
      val splitCmd = hadoop + " jar " + pdioEvalJar + " io.prediction.evaluations.scalding.commons.u2itrainingtestsplit.U2ITrainingTestSplitTime " + argsString + " --totalCount " + count
      executeCommandAndCheck(splitCmd)

      // delete local tmp file
      logger.info(s"Deleting temporary file $localCountPath...")
      localCountFile.delete()
    }

    def executeCommandAndCheck(cmd: String) = {
      logger.info(s"Executing $cmd...")
      if ((cmd.!) != 0) throw new RuntimeException(s"Failed to execute '$cmd'")
    }
  }
}
```
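For illustration, once packed, the wrapper would be run through the generated launcher roughly as follows. Every path and ID below is hypothetical, not taken from this commit; the remaining Scalding arguments (--dbType, --trainingPercent, etc.) would be appended and forwarded via argsString:

```sh
# Hypothetical invocation of the packed launcher script. All paths and
# IDs are illustrative; the wrapper forwards the full argument list to
# the Scalding prep and split jobs.
target/pack/bin/u2itrainingtestsplit \
  --hadoop /usr/bin/hadoop \
  --pdioEvalJar lib/pdio-scalding-assembly.jar \
  --hdfsRoot /predictionio/ \
  --localTempRoot /tmp/predictionio/ \
  --appid 1 --engineid 2 --evalid 3 \
  --sequenceNum 1
```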
...a/io/predictionio/evaluations/commons/u2itrainingtestsplit/U2ITrainingTestSplitTime.scala
127 changes: 0 additions & 127 deletions
This file was deleted.