Skip to content

Commit

Permalink
YARN-4243. Add retry on establishing Zookeeper conenction in Embedded…
Browse files Browse the repository at this point in the history
…ElectorService#serviceInit. Contributed by Xuan Gong.
  • Loading branch information
JunpingDu committed Oct 22, 2015
1 parent 960201b commit 0fce5f9
Show file tree
Hide file tree
Showing 5 changed files with 68 additions and 8 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -208,8 +208,49 @@ static enum State {
*/
public ActiveStandbyElector(String zookeeperHostPorts,
int zookeeperSessionTimeout, String parentZnodeName, List<ACL> acl,
List<ZKAuthInfo> authInfo,
ActiveStandbyElectorCallback app, int maxRetryNum) throws IOException,
List<ZKAuthInfo> authInfo, ActiveStandbyElectorCallback app,
int maxRetryNum) throws IOException, HadoopIllegalArgumentException,
KeeperException {
this(zookeeperHostPorts, zookeeperSessionTimeout, parentZnodeName, acl,
authInfo, app, maxRetryNum, true);
}

/**
* Create a new ActiveStandbyElector object <br/>
* The elector is created by providing to it the Zookeeper configuration, the
* parent znode under which to create the znode and a reference to the
* callback interface. <br/>
* The parent znode name must be the same for all service instances and
* different across services. <br/>
* After the leader has been lost, a new leader will be elected after the
* session timeout expires. Hence, the app must set this parameter based on
* its needs for failure response time. The session timeout must be greater
* than the Zookeeper disconnect timeout and is recommended to be 3X that
* value to enable Zookeeper to retry transient disconnections. Setting a very
* short session timeout may result in frequent transitions between active and
* standby states during issues like network outages/GS pauses.
*
* @param zookeeperHostPorts
* ZooKeeper hostPort for all ZooKeeper servers
* @param zookeeperSessionTimeout
* ZooKeeper session timeout
* @param parentZnodeName
* znode under which to create the lock
* @param acl
* ZooKeeper ACL's
* @param authInfo a list of authentication credentials to add to the
* ZK connection
* @param app
* reference to callback interface object
* @param failFast
* whether need to add the retry when establishing ZK connection.
* @throws IOException
* @throws HadoopIllegalArgumentException
*/
public ActiveStandbyElector(String zookeeperHostPorts,
int zookeeperSessionTimeout, String parentZnodeName, List<ACL> acl,
List<ZKAuthInfo> authInfo, ActiveStandbyElectorCallback app,
int maxRetryNum, boolean failFast) throws IOException,
HadoopIllegalArgumentException, KeeperException {
if (app == null || acl == null || parentZnodeName == null
|| zookeeperHostPorts == null || zookeeperSessionTimeout <= 0) {
Expand All @@ -225,8 +266,12 @@ public ActiveStandbyElector(String zookeeperHostPorts,
zkBreadCrumbPath = znodeWorkingDir + "/" + BREADCRUMB_FILENAME;
this.maxRetryNum = maxRetryNum;

// createConnection for future API calls
createConnection();
// establish the ZK Connection for future API calls
if (failFast) {
createConnection();
} else {
reEstablishSession();
}
}

/**
Expand Down
3 changes: 3 additions & 0 deletions hadoop-yarn-project/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -528,6 +528,9 @@ Release 2.8.0 - UNRELEASED
YARN-3985. Make ReservationSystem persist state using RMStateStore
reservation APIs. (adhoot via asuresh)

YARN-4243. Add retry on establishing Zookeeper conenction in
EmbeddedElectorService#serviceInit. (Xuan Gong via junping_du)

OPTIMIZATIONS

YARN-3339. TestDockerContainerExecutor should pull a single image and not
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -531,6 +531,10 @@ private static void addDeprecatedKeys() {
public static final int
DEFAULT_CLIENT_FAILOVER_RETRIES_ON_SOCKET_TIMEOUTS = 0;

/** number of zookeeper operation retry times in ActiveStandbyElector */
public static final String RM_HA_FC_ELECTOR_ZK_RETRIES_KEY = RM_HA_PREFIX
+ "failover-controller.active-standby-elector.zk.retries";

////////////////////////////////
// RM state store configs
////////////////////////////////
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -388,6 +388,13 @@
<value>org.apache.hadoop.yarn.server.resourcemanager.recovery.FileSystemRMStateStore</value>
</property>

<property>
<description>When automatic failover is enabled, number of zookeeper
operation retry times in ActiveStandbyElector</description>
<name>yarn.resourcemanager.ha.failover-controller.active-standby-elector.zk.retries</name>
<!--<value>3</value>-->
</property>

<property>
<description>The maximum number of completed applications RM state
store keeps, less than or equals to ${yarn.resourcemanager.max-completed-applications}.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -86,11 +86,12 @@ protected void serviceInit(Configuration conf)
List<ACL> zkAcls = RMZKUtils.getZKAcls(conf);
List<ZKUtil.ZKAuthInfo> zkAuths = RMZKUtils.getZKAuths(conf);

int maxRetryNum = conf.getInt(
CommonConfigurationKeys.HA_FC_ELECTOR_ZK_OP_RETRIES_KEY,
CommonConfigurationKeys.HA_FC_ELECTOR_ZK_OP_RETRIES_DEFAULT);
int maxRetryNum =
conf.getInt(YarnConfiguration.RM_HA_FC_ELECTOR_ZK_RETRIES_KEY, conf
.getInt(CommonConfigurationKeys.HA_FC_ELECTOR_ZK_OP_RETRIES_KEY,
CommonConfigurationKeys.HA_FC_ELECTOR_ZK_OP_RETRIES_DEFAULT));
elector = new ActiveStandbyElector(zkQuorum, (int) zkSessionTimeout,
electionZNode, zkAcls, zkAuths, this, maxRetryNum);
electionZNode, zkAcls, zkAuths, this, maxRetryNum, false);

elector.ensureParentZNode();
if (!isParentZnodeSafe(clusterId)) {
Expand Down

0 comments on commit 0fce5f9

Please sign in to comment.