Skip to content

Commit becd4d8

Browse files
authored
Only use wait_maintenance to wait for wait_primary (#794)
Before we would use wait_maintenance together with join_primary. This has no function anymore with the current code. It also fixes a few issues with the existing logic regarding when wait_maintenance is used in combination with wait_primary.
1 parent ac0b3b1 commit becd4d8

File tree

5 files changed

+62
-52
lines changed

5 files changed

+62
-52
lines changed

src/bin/pg_autoctl/fsm.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -340,6 +340,8 @@ KeeperFSMTransition KeeperFSM[] = {
340340
*/
341341
{ SECONDARY_STATE, WAIT_MAINTENANCE_STATE, COMMENT_SECONDARY_TO_WAIT_MAINTENANCE, NULL },
342342
{ CATCHINGUP_STATE, WAIT_MAINTENANCE_STATE, COMMENT_SECONDARY_TO_WAIT_MAINTENANCE, NULL },
343+
{ SECONDARY_STATE, MAINTENANCE_STATE, COMMENT_SECONDARY_TO_MAINTENANCE, &fsm_start_maintenance_on_standby },
344+
{ CATCHINGUP_STATE, MAINTENANCE_STATE, COMMENT_SECONDARY_TO_MAINTENANCE, &fsm_start_maintenance_on_standby },
343345
{ WAIT_MAINTENANCE_STATE, MAINTENANCE_STATE, COMMENT_SECONDARY_TO_MAINTENANCE, &fsm_start_maintenance_on_standby },
344346
{ MAINTENANCE_STATE, CATCHINGUP_STATE, COMMENT_MAINTENANCE_TO_CATCHINGUP, &fsm_restart_standby },
345347
{ PREPARE_MAINTENANCE_STATE, CATCHINGUP_STATE, COMMENT_MAINTENANCE_TO_CATCHINGUP, &fsm_restart_standby },

src/monitor/group_state_machine.c

Lines changed: 8 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -498,31 +498,28 @@ ProceedGroupState(AutoFailoverNode *activeNode)
498498
}
499499

500500
/*
501-
* when secondary is put to maintenance and we have more standby nodes
502-
* wait_maintenance -> maintenance
503-
* join_primary -> primary
501+
* when secondary is in wait_maintenance state and goal state of primary is
502+
* not wait_primary anymore, e.g. another node joined and made it primary
503+
* again or it got demoted. Then we don't need to wait anymore and we can
504+
* transition directly to maintenance.
504505
*/
505506
if (IsCurrentState(activeNode, REPLICATION_STATE_WAIT_MAINTENANCE) &&
506-
primaryNode->reportedState == REPLICATION_STATE_JOIN_PRIMARY &&
507-
(primaryNode->goalState == REPLICATION_STATE_JOIN_PRIMARY ||
508-
primaryNode->goalState == REPLICATION_STATE_PRIMARY))
507+
primaryNode->goalState != REPLICATION_STATE_WAIT_PRIMARY)
509508
{
510509
char message[BUFSIZE];
511510

512511
LogAndNotifyMessage(
513512
message, BUFSIZE,
514513
"Setting goal state of " NODE_FORMAT
515514
" to maintenance after " NODE_FORMAT
516-
" converged to wait_primary.",
515+
" got assigned %s as goal state.",
517516
NODE_FORMAT_ARGS(activeNode),
518-
NODE_FORMAT_ARGS(primaryNode));
517+
NODE_FORMAT_ARGS(primaryNode),
518+
ReplicationStateGetName(primaryNode->goalState));
519519

520520
/* secondary reached maintenance */
521521
AssignGoalState(activeNode, REPLICATION_STATE_MAINTENANCE, message);
522522

523-
/* set the primary back to its normal state (we can failover still) */
524-
AssignGoalState(primaryNode, REPLICATION_STATE_PRIMARY, message);
525-
526523
return true;
527524
}
528525

@@ -1221,16 +1218,6 @@ ProceedGroupStateForPrimaryNode(AutoFailoverNode *primaryNode)
12211218
{
12221219
AutoFailoverNode *otherNode = (AutoFailoverNode *) lfirst(nodeCell);
12231220

1224-
/*
1225-
* Prevent the transition to primary when we have nodes in
1226-
* wait_maintenance.
1227-
*/
1228-
if (otherNode->goalState == REPLICATION_STATE_WAIT_MAINTENANCE)
1229-
{
1230-
allSecondariesAreHealthy = false;
1231-
break;
1232-
}
1233-
12341221
/*
12351222
* Skip nodes that are not failover candidates, and avoid ping-pong
12361223
* between JOIN_PRIMARY and PRIMARY while setting up a node

src/monitor/node_active_protocol.c

Lines changed: 36 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1679,7 +1679,8 @@ start_maintenance(PG_FUNCTION_ARGS)
16791679
int secondaryNodesCount = CountHealthySyncStandbys(secondaryNodesList);
16801680

16811681
if (formation->number_sync_standbys > 0 &&
1682-
secondaryNodesCount <= formation->number_sync_standbys)
1682+
secondaryNodesCount <= formation->number_sync_standbys &&
1683+
IsHealthySyncStandby(currentNode))
16831684
{
16841685
ereport(WARNING,
16851686
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
@@ -1691,12 +1692,7 @@ start_maintenance(PG_FUNCTION_ARGS)
16911692
"healthy node(s) left in the \"secondary\" state "
16921693
"and formation \"%s\" number-sync-standbys requires "
16931694
"%d sync standbys",
1694-
1695-
/*
1696-
* We might double count a standby node when put to
1697-
* maintenance and e.g. already unhealthy.
1698-
*/
1699-
secondaryNodesCount > 0 ? secondaryNodesCount - 1 : 0,
1695+
secondaryNodesCount - 1,
17001696
formation->formationId,
17011697
formation->number_sync_standbys)));
17021698
}
@@ -1793,28 +1789,40 @@ start_maintenance(PG_FUNCTION_ARGS)
17931789
IsCurrentState(primaryNode, REPLICATION_STATE_PRIMARY))
17941790
{
17951791
/*
1796-
* When putting the last secondary node to maintenance, we disable sync
1797-
* rep on the primary by switching it to wait_primary. Because we
1798-
* didn't change the state of any standby node yet, we get there when
1799-
* the count is one (not zero).
1792+
* In most cases we can simply put a secondary directly into
1793+
* maintenance mode. However, when putting the last secondary node
1794+
* that's part of the replication quorum to maintenance, we disable
1795+
* sync rep on the primary by switching it to wait_primary. Otherwise
1796+
* the primary won't be able to accept writes until the monitor assigns
1797+
* it wait_primary. This way we're nice about it and don't bring the
1798+
* secondary down before that happens. Because we didn't change the
1799+
* state of any standby node yet, we get there when the count is one
1800+
* (not zero).
18001801
*/
1801-
ReplicationState primaryGoalState =
1802-
secondaryNodesCount == 1 && formation->number_sync_standbys == 0
1803-
? REPLICATION_STATE_WAIT_PRIMARY
1804-
: REPLICATION_STATE_JOIN_PRIMARY;
1805-
1806-
LogAndNotifyMessage(
1807-
message, BUFSIZE,
1808-
"Setting goal state of " NODE_FORMAT
1809-
" to %s and " NODE_FORMAT
1810-
" to wait_maintenance "
1811-
"after a user-initiated start_maintenance call.",
1812-
NODE_FORMAT_ARGS(primaryNode),
1813-
ReplicationStateGetName(primaryGoalState),
1814-
NODE_FORMAT_ARGS(currentNode));
1815-
1816-
SetNodeGoalState(primaryNode, primaryGoalState, message);
1817-
SetNodeGoalState(currentNode, REPLICATION_STATE_WAIT_MAINTENANCE, message);
1802+
if (formation->number_sync_standbys == 0 && secondaryNodesCount == 1 &&
1803+
IsHealthySyncStandby(currentNode))
1804+
{
1805+
LogAndNotifyMessage(
1806+
message, BUFSIZE,
1807+
"Setting goal state of " NODE_FORMAT
1808+
" to wait_primary and " NODE_FORMAT
1809+
" to wait_maintenance "
1810+
"after a user-initiated start_maintenance call.",
1811+
NODE_FORMAT_ARGS(primaryNode),
1812+
NODE_FORMAT_ARGS(currentNode));
1813+
SetNodeGoalState(primaryNode, REPLICATION_STATE_WAIT_PRIMARY, message);
1814+
SetNodeGoalState(currentNode, REPLICATION_STATE_WAIT_MAINTENANCE, message);
1815+
}
1816+
else
1817+
{
1818+
LogAndNotifyMessage(
1819+
message, BUFSIZE,
1820+
"Setting goal state of " NODE_FORMAT
1821+
" to maintenance "
1822+
"after a user-initiated start_maintenance call.",
1823+
NODE_FORMAT_ARGS(currentNode));
1824+
SetNodeGoalState(currentNode, REPLICATION_STATE_MAINTENANCE, message);
1825+
}
18181826
}
18191827
else
18201828
{

src/monitor/node_metadata.c

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -869,6 +869,20 @@ CountSyncStandbys(List *groupNodeList)
869869
}
870870

871871

872+
/*
873+
* IsHealthySyncStandby returns true if the node has its replicationQuorum property
874+
* set to true, but only if that node is
875+
* currently in REPLICATION_STATE_SECONDARY and known healthy.
876+
*/
877+
bool
878+
IsHealthySyncStandby(AutoFailoverNode *node)
879+
{
880+
return node->replicationQuorum &&
881+
IsCurrentState(node, REPLICATION_STATE_SECONDARY) &&
882+
IsHealthy(node);
883+
}
884+
885+
872886
/*
873887
* CountHealthySyncStandbys returns how many standby nodes have their
874888
* replicationQuorum property set to true in the given groupNodeList, counting
@@ -885,9 +899,7 @@ CountHealthySyncStandbys(List *groupNodeList)
885899
{
886900
AutoFailoverNode *node = (AutoFailoverNode *) lfirst(nodeCell);
887901

888-
if (node->replicationQuorum &&
889-
IsCurrentState(node, REPLICATION_STATE_SECONDARY) &&
890-
IsHealthy(node))
902+
if (IsHealthySyncStandby(node))
891903
{
892904
++count;
893905
}

src/monitor/node_metadata.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -169,6 +169,7 @@ extern List * ListMostAdvancedStandbyNodes(List *groupNodeList);
169169
extern List * GroupListSyncStandbys(List *groupNodeList);
170170
extern bool AllNodesHaveSameCandidatePriority(List *groupNodeList);
171171
extern int CountSyncStandbys(List *groupNodeList);
172+
extern bool IsHealthySyncStandby(AutoFailoverNode *node);
172173
extern int CountHealthySyncStandbys(List *groupNodeList);
173174
extern int CountHealthyCandidates(List *groupNodeList);
174175
extern bool IsFailoverInProgress(List *groupNodeList);

0 commit comments

Comments
 (0)