Skip to content

Commit

Permalink
Cluster: master node now uses new protocol to vote.
Browse files Browse the repository at this point in the history
  • Loading branch information
antirez committed Sep 26, 2013
1 parent fb9b76f commit a445aa3
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 35 deletions.
69 changes: 34 additions & 35 deletions src/cluster.c
Original file line number Diff line number Diff line change
Expand Up @@ -258,6 +258,7 @@ void clusterInit(void) {
server.cluster->failover_auth_time = 0;
server.cluster->failover_auth_count = 0;
server.cluster->failover_auth_epoch = 0;
server.cluster->last_vote_epoch = 0;
memset(server.cluster->migrating_slots_to,0,
sizeof(server.cluster->migrating_slots_to));
memset(server.cluster->importing_slots_from,0,
Expand Down Expand Up @@ -396,6 +397,7 @@ clusterNode *createClusterNode(char *nodename, int flags) {
memset(node->ip,0,sizeof(node->ip));
node->port = 0;
node->fail_reports = listCreate();
node->voted_time = 0;
listSetFreeMethod(node->fail_reports,zfree);
return node;
}
Expand Down Expand Up @@ -1178,15 +1180,18 @@ int clusterProcessPacket(clusterLink *link) {
}
} else if (type == CLUSTERMSG_TYPE_FAILOVER_AUTH_REQUEST) {
if (!sender) return 1; /* We don't know that node. */
/* If we are not a master, ignore that message at all. */
if (!(server.cluster->myself->flags & REDIS_NODE_MASTER)) return 0;
clusterSendFailoverAuthIfNeeded(sender,hdr);
} else if (type == CLUSTERMSG_TYPE_FAILOVER_AUTH_ACK) {
if (!sender) return 1; /* We don't know that node. */
/* If this is a master, increment the number of acknowledges
* we received so far. */
if (sender->flags & REDIS_NODE_MASTER)
/* We consider this vote only if the sender is a master serving
* a non zero number of slots, with the currentEpoch that is equal
* to our currentEpoch. */
if (sender->flags & REDIS_NODE_MASTER &&
sender->numslots > 0 &&
senderCurrentEpoch == server.cluster->currentEpoch)
{
server.cluster->failover_auth_count++;
}
} else {
redisLog(REDIS_WARNING,"Received unknown packet type: %d", type);
}
Expand Down Expand Up @@ -1538,43 +1543,38 @@ void clusterSendFailoverAuth(clusterNode *node, uint64_t reqtime) {
clusterSendMessage(node->link,buf,totlen);
}

/* If we believe 'node' is the "first slave" of it's master, reply with
* a FAILOVER_AUTH_GRANTED packet.
* The 'request' field points to the authorization request packet header, we
* need it in order to copy back the 'time' field in our reply.
*
* To be a first slave the sender must:
* 1) Be a slave.
* 2) Its master should be in FAIL state.
* 3) Ordering all the slaves IDs for its master by run-id, it should be the
* first (the smallest) among the ones not in FAIL / PFAIL state.
*/
/* Vote for the node asking for our vote if there are the conditions. */
void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request) {
char first[REDIS_CLUSTER_NAMELEN];
clusterNode *master = node->slaveof;
int j;
uint64_t requestEpoch = ntohu64(request->currentEpoch);

/* If we are not a master serving at least 1 slot, we don't have the
* right to vote, as the cluster size in Redis Cluster is the number
* of masters serving at least one slot, and the quorum is a majority
* of them: (cluster size / 2) + 1. */
if (!(server.cluster->myself->flags & REDIS_NODE_MASTER)) return;
if (server.cluster->myself->numslots == 0) return;

/* Request epoch must be >= our currentEpoch. */
if (requestEpoch < server.cluster->currentEpoch) return;

/* Node is a slave? Its master is down? */
/* I already voted for this epoch? Return ASAP. */
if (server.cluster->last_vote_epoch == server.cluster->currentEpoch) return;

/* Node must be a slave and its master down. */
if (!(node->flags & REDIS_NODE_SLAVE) ||
master == NULL ||
!(master->flags & REDIS_NODE_FAIL)) return;

/* Iterate all the master slaves to check what's the first one. */
memset(first,0xff,sizeof(first));
for (j = 0; j < master->numslaves; j++) {
clusterNode *slave = master->slaves[j];

if (slave->flags & (REDIS_NODE_FAIL|REDIS_NODE_PFAIL)) continue;
if (memcmp(slave->name,first,sizeof(first)) < 0) {
memcpy(first,slave->name,sizeof(first));
}
}

/* Is 'node' the first slave? */
if (memcmp(node->name,first,sizeof(first)) != 0) return;
/* Vote only if we did not already vote for a slave of this master
* within the last two node timeouts. This is not strictly needed for
* correctness of the algorithm but makes the base case more linear. */
if (server.unixtime - node->slaveof->voted_time <
server.cluster_node_timeout * 2) return;

/* We can send the packet. */
/* We can vote for this slave. */
clusterSendFailoverAuth(node,request->time);
server.cluster->last_vote_epoch = server.cluster->currentEpoch;
node->slaveof->voted_time = server.unixtime;
}

/* This function is called if we are a slave node and our master serving
Expand All @@ -1583,8 +1583,7 @@ void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request) {
* The goal of this function is:
* 1) To check if we are able to perform a failover, is our data updated?
* 2) Try to get elected by masters.
* 3) Check if there is the majority of masters agreeing we should failover.
* 4) Perform the failover informing all the other nodes.
* 3) Perform the failover informing all the other nodes.
*/
void clusterHandleSlaveFailover(void) {
time_t data_age = server.unixtime - server.repl_down_since;
Expand Down
3 changes: 3 additions & 0 deletions src/redis.h
Original file line number Diff line number Diff line change
Expand Up @@ -628,6 +628,7 @@ struct clusterNode {
time_t ping_sent; /* Unix time we sent latest ping */
time_t pong_received; /* Unix time we received the pong */
time_t fail_time; /* Unix time when FAIL flag was set */
time_t voted_time; /* Last time we voted for a slave of this master */
char ip[REDIS_IP_STR_LEN]; /* Latest known IP address of this node */
int port; /* Latest known port of this node */
clusterLink *link; /* TCP/IP link with this node */
Expand All @@ -650,6 +651,8 @@ typedef struct {
int failover_auth_count; /* Number of votes received so far. */
int failover_auth_sent; /* True if we already asked for votes. */
uint64_t failover_auth_epoch; /* Epoch of the current election. */
/* The following fields are used by masters to keep state about elections. */
uint64_t last_vote_epoch; /* Epoch of the last vote granted. */
} clusterState;

/* Redis cluster messages header */
Expand Down

0 comments on commit a445aa3

Please sign in to comment.