Skip to content

Commit becd4d8

Browse files
authored
Only use wait_maintenance to wait for wait_primary (#794)
Before we would use wait_maintenance together with join_primary. This has no function anymore with the current code. It also fixes a few issues with the existing logic regarding when wait_maintenance is used in combination with wait_primary.
1 parent ac0b3b1 commit becd4d8

File tree

5 files changed

+62
-52
lines changed

5 files changed

+62
-52
lines changed

src/bin/pg_autoctl/fsm.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -340,6 +340,8 @@ KeeperFSMTransition KeeperFSM[] = {
340340
*/
341341
{ SECONDARY_STATE, WAIT_MAINTENANCE_STATE, COMMENT_SECONDARY_TO_WAIT_MAINTENANCE, NULL },
342342
{ CATCHINGUP_STATE, WAIT_MAINTENANCE_STATE, COMMENT_SECONDARY_TO_WAIT_MAINTENANCE, NULL },
343+
{ SECONDARY_STATE, MAINTENANCE_STATE, COMMENT_SECONDARY_TO_MAINTENANCE, &fsm_start_maintenance_on_standby },
344+
{ CATCHINGUP_STATE, MAINTENANCE_STATE, COMMENT_SECONDARY_TO_MAINTENANCE, &fsm_start_maintenance_on_standby },
343345
{ WAIT_MAINTENANCE_STATE, MAINTENANCE_STATE, COMMENT_SECONDARY_TO_MAINTENANCE, &fsm_start_maintenance_on_standby },
344346
{ MAINTENANCE_STATE, CATCHINGUP_STATE, COMMENT_MAINTENANCE_TO_CATCHINGUP, &fsm_restart_standby },
345347
{ PREPARE_MAINTENANCE_STATE, CATCHINGUP_STATE, COMMENT_MAINTENANCE_TO_CATCHINGUP, &fsm_restart_standby },

src/monitor/group_state_machine.c

Lines changed: 8 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -498,31 +498,28 @@ ProceedGroupState(AutoFailoverNode *activeNode)
498498
}
499499

500500
/*
501-
* when secondary is put to maintenance and we have more standby nodes
502-
* wait_maintenance -> maintenance
503-
* join_primary -> primary
501+
* when secondary is in wait_maintenance state and goal state of primary is
502+
* not wait_primary anymore, e.g. another node joined and made it primary
503+
* again or it got demoted. Then we don't need to wait anymore and we can
504+
* transition directly to maintenance.
504505
*/
505506
if (IsCurrentState(activeNode, REPLICATION_STATE_WAIT_MAINTENANCE) &&
506-
primaryNode->reportedState == REPLICATION_STATE_JOIN_PRIMARY &&
507-
(primaryNode->goalState == REPLICATION_STATE_JOIN_PRIMARY ||
508-
primaryNode->goalState == REPLICATION_STATE_PRIMARY))
507+
primaryNode->goalState != REPLICATION_STATE_WAIT_PRIMARY)
509508
{
510509
char message[BUFSIZE];
511510

512511
LogAndNotifyMessage(
513512
message, BUFSIZE,
514513
"Setting goal state of " NODE_FORMAT
515514
" to maintenance after " NODE_FORMAT
516-
" converged to wait_primary.",
515+
" got assigned %s as goal state.",
517516
NODE_FORMAT_ARGS(activeNode),
518-
NODE_FORMAT_ARGS(primaryNode));
517+
NODE_FORMAT_ARGS(primaryNode),
518+
ReplicationStateGetName(primaryNode->goalState));
519519

520520
/* secondary reached maintenance */
521521
AssignGoalState(activeNode, REPLICATION_STATE_MAINTENANCE, message);
522522

523-
/* set the primary back to its normal state (we can failover still) */
524-
AssignGoalState(primaryNode, REPLICATION_STATE_PRIMARY, message);
525-
526523
return true;
527524
}
528525

@@ -1221,16 +1218,6 @@ ProceedGroupStateForPrimaryNode(AutoFailoverNode *primaryNode)
12211218
{
12221219
AutoFailoverNode *otherNode = (AutoFailoverNode *) lfirst(nodeCell);
12231220

1224-
/*
1225-
* Prevent the transition to primary when we have nodes in
1226-
* wait_maintenance.
1227-
*/
1228-
if (otherNode->goalState == REPLICATION_STATE_WAIT_MAINTENANCE)
1229-
{
1230-
allSecondariesAreHealthy = false;
1231-
break;
1232-
}
1233-
12341221
/*
12351222
* Skip nodes that are not failover candidates, and avoid ping-pong
12361223
* between JOIN_PRIMARY and PRIMARY while setting up a node

src/monitor/node_active_protocol.c

Lines changed: 36 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1679,7 +1679,8 @@ start_maintenance(PG_FUNCTION_ARGS)
16791679
int secondaryNodesCount = CountHealthySyncStandbys(secondaryNodesList);
16801680

16811681
if (formation->number_sync_standbys > 0 &&
1682-
secondaryNodesCount <= formation->number_sync_standbys)
1682+
secondaryNodesCount <= formation->number_sync_standbys &&
1683+
IsHealthySyncStandby(currentNode))
16831684
{
16841685
ereport(WARNING,
16851686
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
@@ -1691,12 +1692,7 @@ start_maintenance(PG_FUNCTION_ARGS)
16911692
"healthy node(s) left in the \"secondary\" state "
16921693
"and formation \"%s\" number-sync-standbys requires "
16931694
"%d sync standbys",
1694-
1695-
/*
1696-
* We might double count a standby node when put to
1697-
* maintenance and e.g. already unhealthy.
1698-
*/
1699-
secondaryNodesCount > 0 ? secondaryNodesCount - 1 : 0,
1695+
secondaryNodesCount - 1,
17001696
formation->formationId,
17011697
formation->number_sync_standbys)));
17021698
}
@@ -1793,28 +1789,40 @@ start_maintenance(PG_FUNCTION_ARGS)
17931789
IsCurrentState(primaryNode, REPLICATION_STATE_PRIMARY))
17941790
{
17951791
/*
1796-
* When putting the last secondary node to maintenance, we disable sync
1797-
* rep on the primary by switching it to wait_primary. Because we
1798-
* didn't change the state of any standby node yet, we get there when
1799-
* the count is one (not zero).
1792+
* In most cases we can simply put a secondary directly into
1793+
* maintenance mode. However, when putting the last secondary node
1794+
* that's part of the replication quorum to maintenance, we disable
1795+
* sync rep on the primary by switching it to wait_primary. Otherwise
1796+
* the primary won't be able to accept writes until the monitor assigns
1797+
* it wait_primary. This way we're nice about it and don't bring the
1798+
* secondary down before that happens. Because we didn't change the
1799+
* state of any standby node yet, we get there when the count is one
1800+
* (not zero).
18001801
*/
1801-
ReplicationState primaryGoalState =
1802-
secondaryNodesCount == 1 && formation->number_sync_standbys == 0
1803-
? REPLICATION_STATE_WAIT_PRIMARY
1804-
: REPLICATION_STATE_JOIN_PRIMARY;
1805-
1806-
LogAndNotifyMessage(
1807-
message, BUFSIZE,
1808-
"Setting goal state of " NODE_FORMAT
1809-
" to %s and " NODE_FORMAT
1810-
" to wait_maintenance "
1811-
"after a user-initiated start_maintenance call.",
1812-
NODE_FORMAT_ARGS(primaryNode),
1813-
ReplicationStateGetName(primaryGoalState),
1814-
NODE_FORMAT_ARGS(currentNode));
1815-
1816-
SetNodeGoalState(primaryNode, primaryGoalState, message);
1817-
SetNodeGoalState(currentNode, REPLICATION_STATE_WAIT_MAINTENANCE, message);
1802+
if (formation->number_sync_standbys == 0 && secondaryNodesCount == 1 &&
1803+
IsHealthySyncStandby(currentNode))
1804+
{
1805+
LogAndNotifyMessage(
1806+
message, BUFSIZE,
1807+
"Setting goal state of " NODE_FORMAT
1808+
" to wait_primary and " NODE_FORMAT
1809+
" to wait_maintenance "
1810+
"after a user-initiated start_maintenance call.",
1811+
NODE_FORMAT_ARGS(primaryNode),
1812+
NODE_FORMAT_ARGS(currentNode));
1813+
SetNodeGoalState(primaryNode, REPLICATION_STATE_WAIT_PRIMARY, message);
1814+
SetNodeGoalState(currentNode, REPLICATION_STATE_WAIT_MAINTENANCE, message);
1815+
}
1816+
else
1817+
{
1818+
LogAndNotifyMessage(
1819+
message, BUFSIZE,
1820+
"Setting goal state of " NODE_FORMAT
1821+
" to maintenance "
1822+
"after a user-initiated start_maintenance call.",
1823+
NODE_FORMAT_ARGS(currentNode));
1824+
SetNodeGoalState(currentNode, REPLICATION_STATE_MAINTENANCE, message);
1825+
}
18181826
}
18191827
else
18201828
{

src/monitor/node_metadata.c

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -869,6 +869,20 @@ CountSyncStandbys(List *groupNodeList)
869869
}
870870

871871

872+
/*
873+
* IsHealthySyncStandby returns true if the node has its replicationQuorum property
874+
* set to true, but only if that node is
875+
* currently in REPLICATION_STATE_SECONDARY and known healthy.
876+
*/
877+
bool
878+
IsHealthySyncStandby(AutoFailoverNode *node)
879+
{
880+
return node->replicationQuorum &&
881+
IsCurrentState(node, REPLICATION_STATE_SECONDARY) &&
882+
IsHealthy(node);
883+
}
884+
885+
872886
/*
873887
* CountHealthySyncStandbys returns how many standby nodes have their
874888
* replicationQuorum property set to true in the given groupNodeList, counting
@@ -885,9 +899,7 @@ CountHealthySyncStandbys(List *groupNodeList)
885899
{
886900
AutoFailoverNode *node = (AutoFailoverNode *) lfirst(nodeCell);
887901

888-
if (node->replicationQuorum &&
889-
IsCurrentState(node, REPLICATION_STATE_SECONDARY) &&
890-
IsHealthy(node))
902+
if (IsHealthySyncStandby(node))
891903
{
892904
++count;
893905
}

src/monitor/node_metadata.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -169,6 +169,7 @@ extern List * ListMostAdvancedStandbyNodes(List *groupNodeList);
169169
extern List * GroupListSyncStandbys(List *groupNodeList);
170170
extern bool AllNodesHaveSameCandidatePriority(List *groupNodeList);
171171
extern int CountSyncStandbys(List *groupNodeList);
172+
extern bool IsHealthySyncStandby(AutoFailoverNode *node);
172173
extern int CountHealthySyncStandbys(List *groupNodeList);
173174
extern int CountHealthyCandidates(List *groupNodeList);
174175
extern bool IsFailoverInProgress(List *groupNodeList);

0 commit comments

Comments
 (0)