Skip to content

Commit 814567e

Browse files
authored
Review pg_autoctl show state output, and docs. (#617)
* Review the FSM diagram. Make it obvious that we have more than one state where the application can successfully connect to a read-write node. * Review pg_autoctl show state output. Make it clear which kind of connection is given by which node, in particular that when in the SINGLE or WAIT_PRIMARY state we are still delivering a read-write connection to the application. The "Reachable" column is now a "Connection" column and the node health as seen by the monitor is encoded in the single-char *, !, or ?, depending on the health value (1, 0, -1). * Per review, simplify indications when it's all good.
1 parent 2437111 commit 814567e

6 files changed

Lines changed: 536 additions & 377 deletions

File tree

docs/failover-state-machine.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,11 @@ all the other nodes.
6565

6666
pg_auto_failover Finite State Machine diagram
6767

68+
In the previous diagram we can see that we have a list of six states where
69+
the application can connect to a read-write Postgres service: ``single``,
70+
``wait_primary``, ``primary``, ``prepare_maintenance``, ``apply_settings``,
71+
and ``join_primary``.
72+
6873
Init
6974
^^^^
7075

docs/fsm.png

-6.46 KB
Loading

docs/tikz/fsm.svg

Lines changed: 415 additions & 364 deletions
Loading

docs/tikz/fsm.tex

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -39,11 +39,11 @@
3939
nodes={text height=.7em, text depth=.2em,
4040
draw=black!20, thick, fill=blue!20, font=\footnotesize}]
4141
{
42-
init [circle,draw,x=-10,y=10,color=white,fill=black!70] -> single [circle,draw=black,color=white,fill=black!50];
42+
init [circle,draw,x=-10,y=10,color=white,fill=black!70] -> single [font=\normalsize, draw=black, circle];
4343
init -> "wait standby" [fill=violet!30];
44-
single -> "wait primary" ;
45-
"wait primary" -> primary [circle,draw=black,circle];
46-
"join primary" -> primary;
44+
single -> "wait primary" [font=\normalsize, draw=black];
45+
"wait primary" -> primary [font=\normalsize, draw=black, circle];
46+
"join primary" [font=\normalsize, draw=black] -> primary;
4747
"wait standby" -> "catching up" [fill=violet!30];
4848
"catching up" -> secondary [circle,draw=black,circle,fill=violet!30];
4949
primary -> "join primary" ;
@@ -59,14 +59,14 @@
5959
draining -> "demote timeout" ;
6060
"demote timeout" -> demoted ;
6161

62-
primary -> "apply settings" [fill=green!20];
62+
primary -> "apply settings" [font=\normalsize, draw=black];
6363
"apply settings" -> primary ;
6464
"apply settings" -> draining ;
6565
"apply settings" -> demoted ;
6666
"apply settings" -> "demote timeout" ;
6767
"apply settings" -> primary ;
6868

69-
primary -> "prepare maintenance" [fill=blue!10];
69+
primary -> "prepare maintenance" [font=\normalsize, draw=black];
7070
"prepare maintenance" -> maintenance [fill=black!20];
7171
secondary -> "wait maintenance" [fill=violet!20];
7272
"catching up" -> "wait maintenance" ;

src/bin/pg_autoctl/nodestate_utils.c

Lines changed: 108 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ nodestatePrepareHeaders(CurrentNodeStateArray *nodesArray,
3030
nodesArray->headers.maxNodeSize = 5; /* "Node" */
3131
nodesArray->headers.maxLSNSize = 3; /* "LSN" */
3232
nodesArray->headers.maxStateSize = MAX_NODE_STATE_LEN;
33-
nodesArray->headers.maxHealthSize = strlen("Reachable");
33+
nodesArray->headers.maxHealthSize = strlen("read-write *");
3434

3535
/*
3636
* Dynamically adjust our display output to the length of the longer
@@ -184,11 +184,11 @@ nodestateAdjustHeaders(NodeAddressHeaders *headers,
184184
if (headers->maxHealthSize == 0)
185185
{
186186
/*
187-
* Reachable, which is longer than "unknown", "yes", and "no", so
188-
* that's all we use here, a static length actually. Which is good,
189-
* because a NodeAddress does not know its own health anyway.
187+
* Connection is one of "read-write", "read-only", "none", or "unknown",
188+
* optionally followed by a health mark (!, ?, or -), so we need as much
189+
* space as the widest sample, here written "read-write *":
190190
*/
191-
headers->maxHealthSize = strlen("Reachable");
191+
headers->maxHealthSize = strlen("read-write *");
192192
}
193193

194194
if (nameLen > headers->maxNameSize)
@@ -224,7 +224,7 @@ nodestatePrintHeader(NodeAddressHeaders *headers)
224224
headers->maxNodeSize, "Node",
225225
headers->maxHostSize, "Host:Port",
226226
headers->maxLSNSize, "LSN",
227-
headers->maxHealthSize, "Reachable",
227+
headers->maxHealthSize, "Connection",
228228
headers->maxStateSize, "Current State",
229229
headers->maxStateSize, "Assigned State");
230230

@@ -249,19 +249,31 @@ nodestatePrintNodeState(NodeAddressHeaders *headers,
249249
{
250250
char hostport[BUFSIZE] = { 0 };
251251
char composedId[BUFSIZE] = { 0 };
252+
char connection[BUFSIZE] = { 0 };
253+
char healthChar = nodestateHealthToChar(nodeState->health);
252254

253255
(void) nodestatePrepareNode(headers,
254256
&(nodeState->node),
255257
nodeState->groupId,
256258
hostport,
257259
composedId);
258260

261+
if (healthChar == ' ')
262+
{
263+
sformat(connection, BUFSIZE, "%s", nodestateConnectionType(nodeState));
264+
}
265+
else
266+
{
267+
sformat(connection, BUFSIZE, "%s %c",
268+
nodestateConnectionType(nodeState), healthChar);
269+
}
270+
259271
fformat(stdout, "%*s | %*s | %*s | %*s | %*s | %*s | %*s\n",
260272
headers->maxNameSize, nodeState->node.name,
261273
headers->maxNodeSize, composedId,
262274
headers->maxHostSize, hostport,
263275
headers->maxLSNSize, nodeState->node.lsn,
264-
headers->maxHealthSize, nodestateHealthToString(nodeState->health),
276+
headers->maxHealthSize, connection,
265277
headers->maxStateSize, NodeStateToString(nodeState->reportedState),
266278
headers->maxStateSize, NodeStateToString(nodeState->goalState));
267279
}
@@ -346,6 +358,9 @@ nodestateAsJSON(CurrentNodeState *nodeState, JSON_Value *js)
346358
json_object_set_string(jsobj, "reachable",
347359
nodestateHealthToString(nodeState->health));
348360

361+
json_object_set_string(jsobj, "conntype",
362+
nodestateConnectionType(nodeState));
363+
349364
return true;
350365
}
351366

@@ -382,6 +397,92 @@ nodestateHealthToString(int health)
382397
}
383398

384399

400+
/*
 * nodestateHealthToChar transforms the health column from a monitor into a
 * single char, meant to be displayed next to the connection type:
 *
 *   -1 is rendered as '?'
 *    0 is rendered as '!'
 *    1 is rendered as ' ', so that callers can skip the mark entirely
 *
 * Any other value is not expected: it is reported as a bug in the logs and
 * rendered as '-'.
 */
char
nodestateHealthToChar(int health)
{
	switch (health)
	{
		case -1:
		{
			return '?';
		}

		case 0:
		{
			return '!';
		}

		case 1:
		{
			return ' ';
		}

		default:
		{
			/* name this function in the message, not its ToString sibling */
			log_error("BUG in nodestateHealthToChar: health = %d", health);
			return '-';
		}
	}
}
430+
431+
432+
/*
433+
* nodestateConnectionType returns one of "read-write" or "read-only".
434+
*/
435+
char *
436+
nodestateConnectionType(CurrentNodeState *nodeState)
437+
{
438+
switch (nodeState->reportedState)
439+
{
440+
case SINGLE_STATE:
441+
case PRIMARY_STATE:
442+
case WAIT_PRIMARY_STATE:
443+
case JOIN_PRIMARY_STATE:
444+
case PREPARE_MAINTENANCE_STATE:
445+
case APPLY_SETTINGS_STATE:
446+
{
447+
return "read-write";
448+
}
449+
450+
case SECONDARY_STATE:
451+
case CATCHINGUP_STATE:
452+
case PREP_PROMOTION_STATE:
453+
case STOP_REPLICATION_STATE:
454+
case WAIT_MAINTENANCE_STATE:
455+
case FAST_FORWARD_STATE:
456+
case JOIN_SECONDARY_STATE:
457+
{
458+
return "read-only";
459+
}
460+
461+
/* in those states Postgres is known to be stopped/down */
462+
case NO_STATE:
463+
case INIT_STATE:
464+
case WAIT_STANDBY_STATE:
465+
case DEMOTED_STATE:
466+
case DEMOTE_TIMEOUT_STATE:
467+
case DRAINING_STATE:
468+
case MAINTENANCE_STATE:
469+
case REPORT_LSN_STATE:
470+
{
471+
return "none";
472+
}
473+
474+
case ANY_STATE:
475+
{
476+
return "unknown";
477+
}
478+
479+
/* default: is intentionally left out to have compiler check */
480+
}
481+
482+
return "unknown";
483+
}
484+
485+
385486
/*
386487
* nodestate_log logs a CurrentNodeState, usually that comes from a
387488
* notification message we parse.

src/bin/pg_autoctl/nodestate_utils.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,8 @@ void prepareHostNameSeparator(char nameSeparatorHeader[], int size);
8989
bool nodestateAsJSON(CurrentNodeState *nodeState, JSON_Value *js);
9090

9191
char * nodestateHealthToString(int health);
92+
char nodestateHealthToChar(int health);
93+
char * nodestateConnectionType(CurrentNodeState *nodeState);
9294
void nodestate_log(CurrentNodeState *nodeState, int logLevel, int nodeId);
9395

9496
void printNodeArray(NodeAddressArray *nodesArray);

0 commit comments

Comments
 (0)