Skip to content

Commit 5304068

Browse files
authored
Have pg_autoctl drop node command wait until the node has been removed. (#748)
1 parent a88ded0 commit 5304068

5 files changed

Lines changed: 237 additions & 23 deletions

File tree

docs/ref/pg_autoctl_drop_node.rst

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ This command drops a Postgres node from the pg_auto_failover monitor::
2020
--pgport drop the node with given hostname and pgport
2121
--destroy also destroy Postgres database
2222
--force force dropping the node from the monitor
23+
--wait how many seconds to wait, default to 60
2324

2425
Description
2526
-----------
@@ -89,6 +90,14 @@ Options
8990
possible to use the option ``--force`` to immediately remove the node from
9091
the monitor.
9192

93+
--wait
94+
95+
How many seconds to wait for the node to be dropped entirely. The command
96+
stops when the target node is not to be found on the monitor anymore, or
97+
when the timeout has elapsed, whichever comes first. The value 0 (zero)
98+
disables waiting (and thus the timeout) entirely, making the command
99+
asynchronous: it returns without checking that the node is gone.
100+
92101
Examples
93102
--------
94103

src/bin/pg_autoctl/cli_drop_node.c

Lines changed: 115 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,10 @@ static int cli_drop_node_getopts(int argc, char **argv);
5252
static void cli_drop_node(int argc, char **argv);
5353
static void cli_drop_monitor(int argc, char **argv);
5454

55-
static void cli_drop_node_from_monitor(KeeperConfig *config);
55+
static void cli_drop_node_from_monitor(KeeperConfig *config,
56+
int64_t *nodeId,
57+
int *groupId);
58+
5659
static void cli_drop_local_node(KeeperConfig *config, bool dropAndDestroy);
5760
static void cli_drop_local_monitor(MonitorConfig *mconfig, bool dropAndDestroy);
5861

@@ -62,6 +65,7 @@ static void cli_drop_node_files_and_directories(KeeperConfig *config);
6265
static void stop_postgres_and_remove_pgdata_and_config(ConfigFilePaths *pathnames,
6366
PostgresSetup *pgSetup);
6467

68+
static void cli_drop_node_from_monitor_and_wait(KeeperConfig *config);
6569

6670
CommandLine drop_monitor_command =
6771
make_command("monitor",
@@ -85,7 +89,8 @@ CommandLine drop_node_command =
8589
" --hostname drop the node with given hostname and pgport\n"
8690
" --pgport drop the node with given hostname and pgport\n"
8791
" --destroy also destroy Postgres database\n"
88-
" --force force dropping the node from the monitor\n",
92+
" --force force dropping the node from the monitor\n"
93+
" --wait how many seconds to wait, default to 60 \n",
8994
cli_drop_node_getopts,
9095
cli_drop_node);
9196

@@ -108,6 +113,7 @@ cli_drop_node_getopts(int argc, char **argv)
108113
{ "hostname", required_argument, NULL, 'n' },
109114
{ "pgport", required_argument, NULL, 'p' },
110115
{ "formation", required_argument, NULL, 'f' },
116+
{ "wait", required_argument, NULL, 'w' },
111117
{ "name", required_argument, NULL, 'a' },
112118
{ "version", no_argument, NULL, 'V' },
113119
{ "verbose", no_argument, NULL, 'v' },
@@ -118,6 +124,9 @@ cli_drop_node_getopts(int argc, char **argv)
118124

119125
optind = 0;
120126

127+
options.listen_notifications_timeout =
128+
PG_AUTOCTL_LISTEN_NOTIFICATIONS_TIMEOUT;
129+
121130
while ((c = getopt_long(argc, argv, "D:dn:p:Vvqh",
122131
long_options, &option_index)) != -1)
123132
{
@@ -191,6 +200,19 @@ cli_drop_node_getopts(int argc, char **argv)
191200
break;
192201
}
193202

203+
case 'w':
204+
{
205+
/* { "wait", required_argument, NULL, 'w' }, */
206+
if (!stringToInt(optarg, &options.listen_notifications_timeout))
207+
{
208+
log_fatal("--wait argument is not a valid timeout: \"%s\"",
209+
optarg);
210+
exit(EXIT_CODE_BAD_ARGS);
211+
}
212+
log_trace("--wait %d", options.listen_notifications_timeout);
213+
break;
214+
}
215+
194216
case 'V':
195217
{
196218
/* keeper_cli_print_version prints version and exits. */
@@ -395,7 +417,7 @@ cli_drop_node(int argc, char **argv)
395417
exit(EXIT_CODE_BAD_ARGS);
396418
}
397419

398-
(void) cli_drop_node_from_monitor(&config);
420+
(void) cli_drop_node_from_monitor_and_wait(&config);
399421
}
400422
}
401423

@@ -483,7 +505,7 @@ cli_drop_monitor(int argc, char **argv)
483505
* --name.
484506
*/
485507
static void
486-
cli_drop_node_from_monitor(KeeperConfig *config)
508+
cli_drop_node_from_monitor(KeeperConfig *config, int64_t *nodeId, int *groupId)
487509
{
488510
Monitor monitor = { 0 };
489511

@@ -498,7 +520,9 @@ cli_drop_node_from_monitor(KeeperConfig *config)
498520
if (!monitor_remove_by_nodename(&monitor,
499521
(char *) config->formation,
500522
(char *) config->name,
501-
dropForce))
523+
dropForce,
524+
nodeId,
525+
groupId))
502526
{
503527
/* errors have already been logged */
504528
exit(EXIT_CODE_MONITOR);
@@ -518,7 +542,9 @@ cli_drop_node_from_monitor(KeeperConfig *config)
518542
if (!monitor_remove_by_hostname(&monitor,
519543
(char *) config->hostname,
520544
pgport,
521-
dropForce))
545+
dropForce,
546+
nodeId,
547+
groupId))
522548
{
523549
/* errors have already been logged */
524550
exit(EXIT_CODE_MONITOR);
@@ -569,7 +595,10 @@ cli_drop_local_node(KeeperConfig *config, bool dropAndDestroy)
569595
/* first drop the node from the monitor */
570596
if (keeperState->assigned_role != DROPPED_STATE)
571597
{
572-
(void) cli_drop_node_from_monitor(config);
598+
int64_t nodeId = -1;
599+
int groupId = -1;
600+
601+
(void) cli_drop_node_from_monitor(config, &nodeId, &groupId);
573602
}
574603

575604
/*
@@ -851,3 +880,82 @@ stop_postgres_and_remove_pgdata_and_config(ConfigFilePaths *pathnames,
851880
exit(EXIT_CODE_BAD_CONFIG);
852881
}
853882
}
883+
884+
885+
/*
886+
* cli_drop_node_from_monitor_and_wait drops the node from the monitor, then
887+
* waits until it doesn't exist there anymore, meaning it's been fully dropped.
888+
*/
889+
static void
890+
cli_drop_node_from_monitor_and_wait(KeeperConfig *config)
891+
{
892+
bool dropped = false;
893+
Monitor monitor = { 0 };
894+
895+
(void) cli_monitor_init_from_option_or_config(&monitor, config);
896+
897+
/* call pgautofailover.remove_node() on the monitor */
898+
int64_t nodeId;
899+
int groupId;
900+
901+
(void) cli_drop_node_from_monitor(config, &nodeId, &groupId);
902+
903+
/* if the timeout is zero, just don't wait at all */
904+
if (config->listen_notifications_timeout == 0)
905+
{
906+
return;
907+
}
908+
909+
log_info("Waiting until the node with id %lld in group %d has been "
910+
"dropped from the monitor, or for %ds, whichever comes first",
911+
(long long) nodeId, groupId, config->listen_notifications_timeout);
912+
913+
uint64_t start = time(NULL);
914+
915+
/* establish a connection for notifications if none present */
916+
(void) pgsql_prepare_to_wait(&(monitor.notificationClient));
917+
918+
while (!dropped)
919+
{
920+
NodeAddressArray nodesArray = { 0 };
921+
922+
bool groupStateHasChanged = false;
923+
int timeoutMs = PG_AUTOCTL_KEEPER_SLEEP_TIME * 1000;
924+
925+
uint64_t now = time(NULL);
926+
927+
if ((now - start) > config->listen_notifications_timeout)
928+
{
929+
log_error("Failed to wait until the node has been dropped");
930+
exit(EXIT_CODE_INTERNAL_ERROR);
931+
}
932+
933+
(void) monitor_wait_for_state_change(&monitor,
934+
config->formation,
935+
groupId,
936+
nodeId,
937+
timeoutMs,
938+
&groupStateHasChanged);
939+
940+
if (!monitor_find_node_by_nodeid(&monitor,
941+
config->formation,
942+
groupId,
943+
nodeId,
944+
&nodesArray))
945+
{
946+
log_error("Failed to query monitor to see if node id %lld "
947+
"has been dropped already",
948+
(long long) nodeId);
949+
exit(EXIT_CODE_MONITOR);
950+
}
951+
952+
dropped = nodesArray.count == 0;
953+
954+
if (dropped)
955+
{
956+
log_info("Node with id %lld in group %d has been successfully "
957+
"dropped from the monitor",
958+
(long long) nodeId, groupId);
959+
}
960+
}
961+
}

src/bin/pg_autoctl/cli_enable_disable.c

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1714,11 +1714,16 @@ cli_disable_monitor(int argc, char **argv)
17141714
nodesArray.nodes[nodeIndex].host,
17151715
nodesArray.nodes[nodeIndex].port);
17161716

1717+
int64_t nodeId = -1;
1718+
int groupId = -1;
1719+
17171720
if (!monitor_remove_by_hostname(
17181721
monitor,
17191722
nodesArray.nodes[nodeIndex].host,
17201723
nodesArray.nodes[nodeIndex].port,
1721-
optForce))
1724+
optForce,
1725+
&nodeId,
1726+
&groupId))
17221727
{
17231728
/* errors have already been logged */
17241729
exit(EXIT_CODE_MONITOR);

0 commit comments

Comments
 (0)