Skip to content

Commit c1dfd70

Browse files
authored
Refrain from using atexit() to clean-up our logs semaphore. (#811)
* Introduce a semaphore owner (pid_t) to control semaphore clean-up. * Make sure to set returnCode and errno in runprogram when access fails. * Fix pg_autoctl do tmux session single semaphore.
1 parent 84576fd commit c1dfd70

11 files changed

Lines changed: 98 additions & 22 deletions

File tree

src/bin/lib/subcommands.c/runprogram.h

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -183,6 +183,18 @@ execute_subprogram(Program *prog)
183183
int outpipe[2] = { 0, 0 };
184184
int errpipe[2] = { 0, 0 };
185185

186+
/* first level sanity check */
187+
if (access(prog->program, F_OK | X_OK) == -1)
188+
{
189+
fprintf(stderr, "Failed to find executable program at \"%s\": %s\n",
190+
prog->program,
191+
strerror(errno));
192+
193+
prog->returnCode = -1;
194+
prog->error = errno;
195+
return;
196+
}
197+
186198
/* Flush stdio channels just before fork, to avoid double-output problems */
187199
fflush(stdout);
188200
fflush(stderr);
@@ -326,6 +338,18 @@ execute_program(Program *prog)
326338
return;
327339
}
328340

341+
/* first level sanity check */
342+
if (access(prog->program, F_OK | X_OK) == -1)
343+
{
344+
fprintf(stderr, "Failed to find executable program at \"%s\": %s\n",
345+
prog->program,
346+
strerror(errno));
347+
348+
prog->returnCode = -1;
349+
prog->error = errno;
350+
return;
351+
}
352+
329353
if (prog->tty == false)
330354
{
331355
/*

src/bin/pg_autoctl/azure.c

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -156,8 +156,6 @@ azure_start_command(Program *program)
156156
pid_t fpid;
157157
char command[BUFSIZE] = { 0 };
158158

159-
IntString semIdString = intToString(log_semaphore.semId);
160-
161159
(void) snprintf_program_command_line(program, command, sizeof(command));
162160

163161
if (dryRun)
@@ -174,9 +172,6 @@ azure_start_command(Program *program)
174172
fflush(stdout);
175173
fflush(stderr);
176174

177-
/* we want to use the same logs semaphore in the sub-processes */
178-
setenv(PG_AUTOCTL_LOG_SEMAPHORE, semIdString.strValue, 1);
179-
180175
/* time to create the node_active sub-process */
181176
fpid = fork();
182177

src/bin/pg_autoctl/cli_do_tmux.c

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -882,6 +882,15 @@ tmux_start_server(const char *scriptName, const char *binpath)
882882
char tmux[MAXPGPATH] = { 0 };
883883
char command[BUFSIZE] = { 0 };
884884

885+
/*
886+
* Here we are going to remain the parent process of multiple pg_autoctl
887+
* top-level processes, one per node (and the monitor). We don't want all
888+
* those processes to use the same semaphore for logging, so make sure we
889+
* remove ourselves from the environment before we start all the
890+
* sub-processes.
891+
*/
892+
unsetenv(PG_AUTOCTL_LOG_SEMAPHORE);
893+
885894
if (setenv("PG_AUTOCTL_DEBUG", "1", 1) != 0)
886895
{
887896
log_error("Failed to set environment PG_AUTOCTL_DEBUG: %m");

src/bin/pg_autoctl/demoapp.c

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -185,8 +185,6 @@ demoapp_run(const char *pguri, DemoAppOptions *demoAppOptions)
185185
int startedClientsCount = 0;
186186
pid_t clientsPidArray[MAX_CLIENTS_COUNT + 1] = { 0 };
187187

188-
IntString semIdString = intToString(log_semaphore.semId);
189-
190188
log_info("Starting %d concurrent clients as sub-processes", clientsCount);
191189

192190

@@ -195,7 +193,6 @@ demoapp_run(const char *pguri, DemoAppOptions *demoAppOptions)
195193
fflush(stderr);
196194

197195
/* we want to use the same logs semaphore in the sub-processes */
198-
setenv(PG_AUTOCTL_LOG_SEMAPHORE, semIdString.strValue, 1);
199196

200197
for (int index = 0; index <= clientsCount; index++)
201198
{

src/bin/pg_autoctl/lock_utils.c

Lines changed: 63 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,44 @@ semaphore_init(Semaphore *semaphore)
5353
}
5454
else
5555
{
56-
return semaphore_create(semaphore);
56+
bool success = semaphore_create(semaphore);
57+
58+
/*
59+
* Only the main process should unlink the semaphore at exit time.
60+
*
61+
* When we create a semaphore, ensure we put our semId in the expected
62+
* environment variable (PG_AUTOCTL_LOG_SEMAPHORE), and we assign the
63+
* current process' pid as the semaphore owner.
64+
*
65+
* When we open a pre-existing semaphore using PG_AUTOCTL_LOG_SEMAPHORE
66+
* as the semId, the semaphore owner is left to zero.
67+
*
68+
* The atexit(3) function that removes the semaphores only acts when
69+
* the owner is our current pid. That way, in case of an early failure
70+
* in execv(), the semaphore is not dropped from under the main
71+
* program.
72+
*
73+
* A typical way execv() would fail is when calling run_program() on a
74+
* pathname that does not exists.
75+
*
76+
* Per atexit(3) manual page:
77+
*
78+
* When a child process is created via fork(2), it inherits copies of
79+
* its parent's registrations. Upon a successful call to one of the
80+
* exec(3) functions, all registrations are removed.
81+
*
82+
* And that's why it's important that we don't remove the semaphore in
83+
* the atexit() cleanup function when a call to run_command() fails
84+
* early.
85+
*/
86+
if (success)
87+
{
88+
IntString semIdString = intToString(semaphore->semId);
89+
90+
setenv(PG_AUTOCTL_LOG_SEMAPHORE, semIdString.strValue, 1);
91+
}
92+
93+
return success;
5794
}
5895
}
5996

@@ -64,15 +101,33 @@ semaphore_init(Semaphore *semaphore)
64101
bool
65102
semaphore_finish(Semaphore *semaphore)
66103
{
67-
if (env_exists(PG_AUTOCTL_LOG_SEMAPHORE))
104+
/*
105+
* At initialization time we either create a new semaphore and register
106+
* getpid() as the owner, or we open a previously existing semaphore from
107+
* its semId as found in our environment variable PG_AUTOCTL_LOG_SEMAPHORE.
108+
*
109+
* At finish time (called from the atexit(3) registry), we remove the
110+
* semaphore only when we are the owner of it. We expect semaphore->owner
111+
* to be either zero (0), or to have been filled with our own pid.
112+
*/
113+
if (semaphore->owner == 0)
68114
{
69115
/* there's no semaphore closing protocol in SysV */
70116
return true;
71117
}
72-
else
118+
else if (semaphore->owner == getpid())
73119
{
74120
return semaphore_unlink(semaphore);
75121
}
122+
else
123+
{
124+
log_fatal("BUG: semaphore_finish semId %d owner is %d, getpid is %d",
125+
semaphore->semId,
126+
semaphore->owner,
127+
getpid());
128+
129+
return false;
130+
}
76131
}
77132

78133

@@ -84,6 +139,7 @@ semaphore_create(Semaphore *semaphore)
84139
{
85140
union semun semun;
86141

142+
semaphore->owner = getpid();
87143
semaphore->semId = semget(IPC_PRIVATE, 1, 0600);
88144

89145
if (semaphore->semId < 0)
@@ -122,7 +178,10 @@ semaphore_create(Semaphore *semaphore)
122178
bool
123179
semaphore_open(Semaphore *semaphore)
124180
{
125-
char semIdString[BUFSIZE];
181+
char semIdString[BUFSIZE] = { 0 };
182+
183+
/* ensure the owner is set to zero when we re-open an existing semaphore */
184+
semaphore->owner = 0;
126185

127186
if (!get_env_copy(PG_AUTOCTL_LOG_SEMAPHORE, semIdString, BUFSIZE))
128187
{

src/bin/pg_autoctl/lock_utils.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
typedef struct Semaphore
2020
{
2121
int semId;
22+
pid_t owner;
2223
} Semaphore;
2324

2425

src/bin/pg_autoctl/main.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ char *ps_buffer; /* will point to argv area */
3030
size_t ps_buffer_size; /* space determined at run time */
3131
size_t last_status_len; /* use to minimize length of clobber */
3232

33-
Semaphore log_semaphore; /* allows inter-process locking */
33+
Semaphore log_semaphore = { 0 }; /* allows inter-process locking */
3434

3535

3636
static void set_logger(void);

src/bin/pg_autoctl/service_keeper.c

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -164,10 +164,8 @@ service_keeper_runprogram(Keeper *keeper)
164164
* --pgdata was parsed from the main command line to start our sub-process.
165165
*/
166166
char *pgdata = keeperOptions.pgSetup.pgdata;
167-
IntString semIdString = intToString(log_semaphore.semId);
168167

169168
setenv(PG_AUTOCTL_DEBUG, "1", 1);
170-
setenv(PG_AUTOCTL_LOG_SEMAPHORE, semIdString.strValue, 1);
171169

172170
args[argsIndex++] = (char *) pg_autoctl_program;
173171
args[argsIndex++] = "do";

src/bin/pg_autoctl/service_monitor.c

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -156,10 +156,8 @@ service_monitor_runprogram(Monitor *monitor)
156156
IS_EMPTY_STRING_BUFFER(monitorOptions.pgSetup.pgdata)
157157
? keeperOptions.pgSetup.pgdata
158158
: monitorOptions.pgSetup.pgdata;
159-
IntString semIdString = intToString(log_semaphore.semId);
160159

161160
setenv(PG_AUTOCTL_DEBUG, "1", 1);
162-
setenv(PG_AUTOCTL_LOG_SEMAPHORE, semIdString.strValue, 1);
163161

164162
args[argsIndex++] = (char *) pg_autoctl_program;
165163
args[argsIndex++] = "do";

src/bin/pg_autoctl/service_monitor_init.c

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -126,16 +126,13 @@ service_monitor_init_start(void *context, pid_t *pid)
126126
* then quit, avoiding to call the atexit() semaphore clean-up
127127
* function.
128128
*/
129-
IntString semIdString = intToString(log_semaphore.semId);
130129

131130
const char *serviceName = createAndRun ?
132131
"pg_autoctl: monitor listener" :
133132
"pg_autoctl: monitor installer";
134133

135134
(void) set_ps_title(serviceName);
136135

137-
setenv(PG_AUTOCTL_LOG_SEMAPHORE, semIdString.strValue, 1);
138-
139136
/* finish the install if necessary */
140137
if (!monitor_install(config->hostname, *pgSetup, false))
141138
{

0 commit comments

Comments
 (0)