SPARK-1064
This reopens PR 649 from incubator-spark against the new repo

Author: Sandy Ryza <[email protected]>

Closes apache#102 from sryza/sandy-spark-1064 and squashes the following commits:

270e490 [Sandy Ryza] Handle different application classpath variables in different versions
88b04e0 [Sandy Ryza] SPARK-1064. Make it possible to run on YARN without bundling Hadoop jars in Spark assembly
sryza authored and pwendell committed Mar 12, 2014
1 parent 16788a6 commit 2409af9
Showing 3 changed files with 94 additions and 1 deletion.
6 changes: 6 additions & 0 deletions docs/building-with-maven.md
@@ -88,3 +88,9 @@ Running only java 8 tests and nothing else.
Java 8 tests are run when the -Pjava8-tests profile is enabled; they will run in spite of -DskipTests.
For these tests to run, your system must have a JDK 8 installation.
If you have JDK 8 installed but it is not the system default, you can set JAVA_HOME to point to JDK 8 before running the tests.

## Packaging without Hadoop dependencies for deployment on YARN ##

The assembly jar produced by "mvn package" will, by default, include all of Spark's dependencies, including Hadoop and some of its ecosystem projects. On YARN deployments, this causes multiple versions of these to appear on executor classpaths: the version packaged in the Spark assembly and the version on each node, included with yarn.application.classpath. The "hadoop-provided" profile builds the assembly without including Hadoop-ecosystem projects, like ZooKeeper and Hadoop itself.
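For example, the profile can be combined with a YARN-enabled build (an illustrative command; match the profiles and Hadoop version to your environment and Spark version):

    $ mvn -Pyarn -Phadoop-provided -Dhadoop.version=2.2.0 -DskipTests clean package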


46 changes: 46 additions & 0 deletions pom.xml
@@ -807,5 +807,51 @@
</modules>

</profile>

    <!-- Build without Hadoop dependencies that are included in some runtime environments. -->
    <profile>
      <id>hadoop-provided</id>
      <activation>
        <activeByDefault>false</activeByDefault>
      </activation>
      <dependencies>
        <dependency>
          <groupId>org.apache.hadoop</groupId>
          <artifactId>hadoop-client</artifactId>
          <scope>provided</scope>
        </dependency>
        <dependency>
          <groupId>org.apache.hadoop</groupId>
          <artifactId>hadoop-yarn-api</artifactId>
          <scope>provided</scope>
        </dependency>
        <dependency>
          <groupId>org.apache.hadoop</groupId>
          <artifactId>hadoop-yarn-common</artifactId>
          <scope>provided</scope>
        </dependency>
        <dependency>
          <groupId>org.apache.hadoop</groupId>
          <artifactId>hadoop-yarn-client</artifactId>
          <scope>provided</scope>
        </dependency>
        <dependency>
          <groupId>org.apache.avro</groupId>
          <artifactId>avro</artifactId>
          <scope>provided</scope>
        </dependency>
        <dependency>
          <groupId>org.apache.avro</groupId>
          <artifactId>avro-ipc</artifactId>
          <scope>provided</scope>
        </dependency>
        <dependency>
          <groupId>org.apache.zookeeper</groupId>
          <artifactId>zookeeper</artifactId>
          <scope>provided</scope>
        </dependency>
      </dependencies>
    </profile>

</profiles>
</project>
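A note on the approach: Maven's provided scope keeps these artifacts on the compile and test classpaths but leaves them out of the assembly jar, so at runtime they must come from each node's own Hadoop installation, i.e. the directories listed in yarn.application.classpath.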
43 changes: 42 additions & 1 deletion yarn/common/src/main/scala/org/apache/spark/deploy/yarn/ClientBase.scala
@@ -29,8 +29,10 @@ import org.apache.hadoop.fs._
import org.apache.hadoop.fs.permission.FsPermission
import org.apache.hadoop.io.DataOutputBuffer
import org.apache.hadoop.mapred.Master
import org.apache.hadoop.mapreduce.MRJobConfig
import org.apache.hadoop.net.NetUtils
import org.apache.hadoop.security.UserGroupInformation
import org.apache.hadoop.util.StringUtils
import org.apache.hadoop.yarn.api._
import org.apache.hadoop.yarn.api.ApplicationConstants.Environment
import org.apache.hadoop.yarn.api.protocolrecords._
@@ -379,9 +381,48 @@ object ClientBase {

  // Based on code from org.apache.hadoop.mapreduce.v2.util.MRApps
  def populateHadoopClasspath(conf: Configuration, env: HashMap[String, String]) {
    // Prefer the classpath configured on the cluster; fall back to the
    // version-specific defaults when the setting is absent.
    val classpathEntries = Option(conf.getStrings(
      YarnConfiguration.YARN_APPLICATION_CLASSPATH)).getOrElse(
        getDefaultYarnApplicationClasspath())
    for (c <- classpathEntries) {
      Apps.addToEnvironment(env, Environment.CLASSPATH.name, c.trim)
    }

    val mrClasspathEntries = Option(conf.getStrings(
      "mapreduce.application.classpath")).getOrElse(
        getDefaultMRApplicationClasspath())
    if (mrClasspathEntries != null) {
      for (c <- mrClasspathEntries) {
        Apps.addToEnvironment(env, Environment.CLASSPATH.name, c.trim)
      }
    }
  }
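Aside, not part of the patch: a minimal sketch of driving populateHadoopClasspath directly, assuming a Hadoop/YARN client on the classpath; the env map ends up holding the aggregated CLASSPATH entries.

```scala
import scala.collection.mutable.HashMap
import org.apache.hadoop.yarn.conf.YarnConfiguration

// Hypothetical driver for illustration only.
val conf = new YarnConfiguration()  // picks up yarn-site.xml if present
val env = new HashMap[String, String]()
ClientBase.populateHadoopClasspath(conf, env)
println(env.getOrElse("CLASSPATH", "<nothing populated>"))
```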

  def getDefaultYarnApplicationClasspath(): Array[String] = {
    try {
      // The default YARN application classpath constant lives on YarnConfiguration;
      // reflection keeps this working against Hadoop versions that lack the field.
      val field = classOf[YarnConfiguration].getField("DEFAULT_YARN_APPLICATION_CLASSPATH")
      field.get(null).asInstanceOf[Array[String]]
    } catch {
      case err: NoSuchFieldError => null
      case err: NoSuchFieldException => null
    }
  }

  /**
   * In Hadoop 0.23, the MR application classpath comes with the YARN application
   * classpath. In Hadoop 2.0, it's an array of Strings, and in 2.2+ it's a String.
   * So we need to use reflection to retrieve it.
   */
  def getDefaultMRApplicationClasspath(): Array[String] = {
    try {
      val field = classOf[MRJobConfig].getField("DEFAULT_MAPREDUCE_APPLICATION_CLASSPATH")
      if (field.getType == classOf[String]) {
        // 2.2+: a single comma-separated String; split it into entries
        StringUtils.getStrings(field.get(null).asInstanceOf[String])
      } else {
        // 2.0.x: already an Array[String]
        field.get(null).asInstanceOf[Array[String]]
      }
    } catch {
      // A missing field surfaces as NoSuchFieldException via reflection.
      case err: NoSuchFieldError => null
      case err: NoSuchFieldException => null
    }
  }
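Aside, not part of the patch: both getters return null when the running Hadoop version lacks the corresponding constant, so a caller outside populateHadoopClasspath should guard, for example:

```scala
// Hypothetical caller-side guard around the nullable defaults.
val mrDefaults = Option(ClientBase.getDefaultMRApplicationClasspath())
mrDefaults.getOrElse(Array.empty[String]).foreach(entry => println(entry.trim))
```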

def populateClasspath(conf: Configuration, sparkConf: SparkConf, addLog4j: Boolean, env: HashMap[String, String]) {
