Skip to content

Commit f665321

Browse files
authored
Have drop node --force --destroy work as expected again (#743)
The following command would get stuck for 30 seconds when the pg_autoctl service was still running. ``` pg_autoctl drop node --pgdata node1 --force --destroy ```
1 parent 5304068 commit f665321

File tree

4 files changed

+110
-43
lines changed

4 files changed

+110
-43
lines changed

src/bin/pg_autoctl/cli_drop_node.c

Lines changed: 48 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -607,9 +607,47 @@ cli_drop_local_node(KeeperConfig *config, bool dropAndDestroy)
607607
*/
608608
pid_t pid = 0;
609609

610-
if (!wait_for_pid_to_exit(config->pathnames.pid, 30, &pid))
610+
/*
611+
* Before continuing we need to make sure that a currently running service
612+
* has stopped.
613+
*/
614+
bool stopped;
615+
if (dropForce)
616+
{
617+
/*
618+
* If --force is used, we skip the transition to "dropped". So a
619+
* currently running process won't realise it's dropped, which means it
620+
* will not exit by itself. Thus all we need to know is if it's running
621+
* now or not.
622+
*/
623+
if (!is_process_stopped(config->pathnames.pid, &stopped, &pid))
624+
{
625+
/* errors have already been logged */
626+
exit(EXIT_CODE_INTERNAL_ERROR);
627+
}
628+
}
629+
else
630+
{
631+
/*
632+
* If --force isn't used then a running pg_autoctl process will detect
633+
* that it is dropped and clean itself up nicely and finally it will
634+
* exit. We give the process 30 seconds to exit by itself.
635+
*/
636+
if (!wait_for_process_to_stop(config->pathnames.pid, 30, &stopped, &pid))
637+
{
638+
/* errors have already been logged */
639+
exit(EXIT_CODE_INTERNAL_ERROR);
640+
}
641+
}
642+
643+
/*
644+
* If the service is not stopped yet, we just want to process to exit
645+
* so we can take over. This can happen either because --force was used
646+
* or because 30 seconds was not enough time for the service to exit.
647+
*/
648+
if (!stopped)
611649
{
612-
/* if the service isn't terminated in 30s, signal it to quit now */
650+
/* if the service isn't terminated, signal it to quit now */
613651
log_info("Sending signal %s to pg_autoctl process %d",
614652
signal_to_string(SIGQUIT),
615653
pid);
@@ -620,7 +658,8 @@ cli_drop_local_node(KeeperConfig *config, bool dropAndDestroy)
620658
exit(EXIT_CODE_INTERNAL_ERROR);
621659
}
622660

623-
if (!wait_for_pid_to_exit(config->pathnames.pid, 30, &pid))
661+
if (!wait_for_process_to_stop(config->pathnames.pid, 30, &stopped, &pid) ||
662+
!stopped)
624663
{
625664
log_fatal("Failed to stop the pg_autoctl process with pid %d", pid);
626665
exit(EXIT_CODE_INTERNAL_ERROR);
@@ -712,7 +751,9 @@ cli_drop_node_with_monitor_disabled(KeeperConfig *config, bool dropAndDestroy)
712751
exit(EXIT_CODE_INTERNAL_ERROR);
713752
}
714753

715-
if (!wait_for_pid_to_exit(config->pathnames.pid, 30, &pid))
754+
bool stopped;
755+
if (!wait_for_process_to_stop(config->pathnames.pid, 30, &stopped, &pid) ||
756+
!stopped)
716757
{
717758
log_fatal(
718759
"Failed to stop the pg_autoctl process with pid %d",
@@ -779,7 +820,9 @@ cli_drop_local_monitor(MonitorConfig *mconfig, bool dropAndDestroy)
779820
exit(EXIT_CODE_INTERNAL_ERROR);
780821
}
781822

782-
if (!wait_for_pid_to_exit(mconfig->pathnames.pid, 30, &pid))
823+
bool stopped;
824+
if (!wait_for_process_to_stop(mconfig->pathnames.pid, 30, &stopped, &pid) ||
825+
!stopped)
783826
{
784827
log_fatal("Failed to stop the pg_autoctl process with pid %d", pid);
785828
exit(EXIT_CODE_INTERNAL_ERROR);

src/bin/pg_autoctl/pidfile.c

Lines changed: 43 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -499,47 +499,59 @@ pidfile_as_json(JSON_Value *js, const char *pidfile, bool includeStatus)
499499
}
500500

501501

502+
bool
503+
is_process_stopped(const char *pidfile, bool *stopped, pid_t *pid)
504+
{
505+
if (!file_exists(pidfile))
506+
{
507+
*stopped = true;
508+
return true;
509+
}
510+
511+
if (!read_pidfile(pidfile, pid))
512+
{
513+
log_error("Failed to read PID file \"%s\"", pidfile);
514+
return false;
515+
}
516+
517+
*stopped = false;
518+
return true;
519+
}
520+
521+
502522
/*
503-
* wait_for_pid_to_exit waits until the PID found in the pidfile is not running
523+
* wait_for_process_to_stop waits until the PID found in the pidfile is not running
504524
* anymore.
505525
*/
506526
bool
507-
wait_for_pid_to_exit(const char *pidfile, int timeout, pid_t *pid)
527+
wait_for_process_to_stop(const char *pidfile, int timeout, bool *stopped, pid_t *pid)
508528
{
509-
if (file_exists(pidfile))
529+
if (!is_process_stopped(pidfile, stopped, pid))
510530
{
511-
if (read_pidfile(pidfile, pid))
512-
{
513-
log_info("An instance of pg_autoctl is running with PID %d, "
514-
"waiting for it to stop.", *pid);
515-
516-
int timeout_counter = timeout;
531+
/* errors have already been logged */
532+
return false;
533+
}
517534

518-
while (timeout_counter > 0)
519-
{
520-
if (kill(*pid, 0) == -1 && errno == ESRCH)
521-
{
522-
log_info("The pg_autoctl instance with pid %d "
523-
"has now terminated.",
524-
*pid);
525-
break;
526-
}
535+
log_info("An instance of pg_autoctl is running with PID %d, "
536+
"waiting for it to stop.", *pid);
527537

528-
sleep(1);
529-
--timeout_counter;
530-
}
538+
int timeout_counter = timeout;
531539

532-
return timeout_counter > 0;
533-
}
534-
else
540+
while (timeout_counter > 0)
541+
{
542+
if (kill(*pid, 0) == -1 && errno == ESRCH)
535543
{
536-
log_error("Failed to read PID file \"%s\"", pidfile);
537-
return false;
544+
log_info("The pg_autoctl instance with pid %d "
545+
"has now terminated.",
546+
*pid);
547+
*stopped = true;
548+
return true;
538549
}
550+
551+
sleep(1);
552+
--timeout_counter;
539553
}
540-
else
541-
{
542-
/* when the pidfile doesn't exist, assume the service is not running */
543-
return true;
544-
}
554+
555+
*stopped = false;
556+
return true;
545557
}

src/bin/pg_autoctl/pidfile.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,8 @@ void check_pidfile(const char *pidfile, pid_t start_pid);
6767

6868
void pidfile_as_json(JSON_Value *js, const char *pidfile, bool includeStatus);
6969

70-
bool wait_for_pid_to_exit(const char *pidfile, int timeout, pid_t *pid);
70+
bool is_process_stopped(const char *pidfile, bool *stopped, pid_t *pid);
71+
bool wait_for_process_to_stop(const char *pidfile, int timeout, bool *stopped,
72+
pid_t *pid);
7173

7274
#endif /* PIDFILE_H */

tests/pgautofailover_utils.py

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -181,12 +181,12 @@ def pg_createcluster(self, datadir, port=5432):
181181

182182
return abspath
183183

184-
def destroy(self):
184+
def destroy(self, force=True):
185185
"""
186186
Cleanup whatever was created for this Cluster.
187187
"""
188188
for datanode in list(reversed(self.datanodes)):
189-
datanode.destroy()
189+
datanode.destroy(force=force, ignore_failure=True, timeout=3)
190190
if self.monitor:
191191
self.monitor.destroy()
192192
self.vlan.destroy()
@@ -1100,23 +1100,33 @@ def get_nodename(self, nodeId=None):
11001100

11011101
return self.name
11021102

1103-
def destroy(self):
1103+
def destroy(
1104+
self, force=False, ignore_failure=False, timeout=COMMAND_TIMEOUT
1105+
):
11041106
"""
11051107
Cleans up processes and files created for this data node.
11061108
"""
1109+
11071110
self.stop_pg_autoctl()
11081111

1112+
flags = ["--destroy"]
1113+
if force:
1114+
flags.append("--force")
1115+
11091116
try:
11101117
destroy = PGAutoCtl(self)
11111118
destroy.execute(
11121119
"pg_autoctl drop node --destroy",
11131120
"drop",
11141121
"node",
1115-
"--destroy",
1116-
timeout=3,
1122+
*flags,
1123+
timeout=timeout,
11171124
)
11181125
except Exception as e:
1119-
print(str(e))
1126+
if ignore_failure:
1127+
print(str(e))
1128+
else:
1129+
raise
11201130

11211131
try:
11221132
os.remove(self.config_file_path())

0 commit comments

Comments
 (0)