1 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
2 /*
3 * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
4 * University Research and Technology
5 * Corporation. All rights reserved.
6 * Copyright (c) 2004-2005 The University of Tennessee and The University
7 * of Tennessee Research Foundation. All rights
8 * reserved.
9 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
10 * University of Stuttgart. All rights reserved.
11 * Copyright (c) 2004-2005 The Regents of the University of California.
12 * All rights reserved.
13 * Copyright (c) 2007-2008 Sun Microsystems, Inc. All rights reserved.
14 * Copyright (c) 2007-2016 Los Alamos National Security, LLC. All rights
15 * reserved.
16 * Copyright (c) 2011-2013 Cisco Systems, Inc. All rights reserved.
17 * Copyright (c) 2015 Research Organization for Information Science
18 * and Technology (RIST). All rights reserved.
19 * Copyright (c) 2015-2018 Intel, Inc. All rights reserved.
20 * Copyright (c) 2017 UT-Battelle, LLC. All rights reserved.
21 * $COPYRIGHT$
22 *
23 * Additional copyrights may follow
24 *
25 * $HEADER$
26 */
27 #include "orte_config.h"
28 #include "orte/constants.h"
29
30 #include <stdio.h>
31 #include <errno.h>
32 #ifdef HAVE_UNISTD_H
33 #include <unistd.h>
34 #endif /* HAVE_UNISTD_H */
35 #include <stdlib.h>
36 #ifdef HAVE_SYS_STAT_H
37 #include <sys/stat.h>
38 #endif /* HAVE_SYS_STAT_H */
39 #ifdef HAVE_SYS_TYPES_H
40 #include <sys/types.h>
41 #endif /* HAVE_SYS_TYPES_H */
42 #ifdef HAVE_SYS_WAIT_H
43 #include <sys/wait.h>
44 #endif /* HAVE_SYS_WAIT_H */
45 #ifdef HAVE_SYS_PARAM_H
46 #include <sys/param.h>
47 #endif /* HAVE_SYS_PARAM_H */
48 #include <string.h>
49 #ifdef HAVE_DIRENT_H
50 #include <dirent.h>
51 #endif /* HAVE_DIRENT_H */
52 #include <signal.h>
53 #ifdef HAVE_PWD_H
54 #include <pwd.h>
55 #endif /* HAVE_PWD_H */
56
57 #include "opal/util/cmd_line.h"
58 #include "opal/util/opal_environ.h"
59 #include "opal/util/os_dirpath.h"
60 #include "opal/util/basename.h"
61 #include "opal/util/error.h"
62 #include "opal/util/printf.h"
63 #include "opal/mca/base/base.h"
64 #include "opal/util/show_help.h"
65
66 #include "orte/util/proc_info.h"
67 #include "orte/util/show_help.h"
68
69 #include "opal/runtime/opal.h"
70 #if OPAL_ENABLE_FT_CR == 1
71 #include "opal/runtime/opal_cr.h"
72 #endif
73 #include "orte/runtime/runtime.h"
74
/******************
 * Local Functions
 ******************/
/* Sends SIGKILL to stray daemon/launcher processes listed by "ps". */
static void kill_procs(void);
/* Parses the command line into orte_clean_globals. */
static int parse_args(int argc, char *argv[]);
80
/*****************************************
 * Global Vars for Command line Arguments
 *****************************************/
/* One flag per command line switch; the option table stores into these. */
typedef struct {
    bool help;     /* set by -h / --help */
    bool verbose;  /* set by -v / --verbose */
    bool debug;    /* set by -d / --debug */
} orte_clean_globals_t;

/* Every flag starts out cleared. */
orte_clean_globals_t orte_clean_globals = { false, false, false };
91
92 opal_cmd_line_init_t cmd_line_opts[] = {
93 { NULL,
94 'h', NULL, "help",
95 0,
96 &orte_clean_globals.help, OPAL_CMD_LINE_TYPE_BOOL,
97 "This help message" },
98
99 { NULL,
100 'v', NULL, "verbose",
101 0,
102 &orte_clean_globals.verbose, OPAL_CMD_LINE_TYPE_BOOL,
103 "Generate verbose output" },
104
105 { NULL,
106 'd', NULL, "debug",
107 0,
108 &orte_clean_globals.debug, OPAL_CMD_LINE_TYPE_BOOL,
109 "Extra debug output for developers to ensure that orte-clean is working" },
110
111 /* End of list */
112 { NULL,
113 '\0', NULL, NULL,
114 0,
115 NULL, OPAL_CMD_LINE_TYPE_NULL,
116 NULL }
117 };
118
119 /*
120 * This utility will do a brute force clean of a node. It will
121 * attempt to clean up any files in the user's session directory.
122 * It will also look for any orted and orterun processes that are
123 * not part of this job, and kill them off.
124 */
125 int
126 main(int argc, char *argv[])
127 {
128 int ret = ORTE_SUCCESS;
129 #if OPAL_ENABLE_FT_CR == 1
130 char *tmp_env_var;
131 #endif
132 char *legacy;
133
134 /* This is needed so we can print the help message */
135 if (ORTE_SUCCESS != (ret = opal_init_util(&argc, &argv))) {
136 return ret;
137 }
138
139 if (ORTE_SUCCESS != (ret = parse_args(argc, argv))) {
140 return ret;
141 }
142
143 #if OPAL_ENABLE_FT_CR == 1
144 /* Disable the checkpoint notification routine for this
145 * tool. As we will never need to checkpoint this tool.
146 * Note: This must happen before opal_init().
147 */
148 opal_cr_set_enabled(false);
149
150 /* Select the none component, since we don't actually use a checkpointer */
151 (void) mca_base_var_env_name("crs", &tmp_env_var);
152 opal_setenv(tmp_env_var,
153 "none",
154 true, &environ);
155 free(tmp_env_var);
156 tmp_env_var = NULL;
157
158 (void) mca_base_var_env_name("opal_cr_is_tool", &tmp_env_var);
159 opal_setenv(tmp_env_var,
160 "1", true, NULL);
161 free(tmp_env_var);
162 #endif
163
164 if (ORTE_SUCCESS != (ret = orte_init(&argc, &argv, ORTE_PROC_TOOL))) {
165 return ret;
166 }
167
168 /*
169 * Clean out all session directories - we don't have to protect
170 * our own session directory because (since we are a tool) we
171 * didn't create one!
172 */
173 if (orte_clean_globals.verbose) {
174 fprintf(stderr, "orte-clean: cleaning session dir tree %s\n",
175 orte_process_info.top_session_dir);
176 }
177 opal_os_dirpath_destroy(orte_process_info.top_session_dir, true, NULL);
178
179 /* also get rid of any legacy session directories */
180 opal_asprintf(&legacy, "%s/openmpi-sessions-%d@%s_0",
181 orte_process_info.tmpdir_base,
182 (int)geteuid(), orte_process_info.nodename);
183 opal_os_dirpath_destroy(legacy, true, NULL);
184 free(legacy);
185
186 /* and finally get rid of any lingering pmix-related artifacts */
187 opal_asprintf(&legacy, "rm -rf %s/pmix*", orte_process_info.tmpdir_base);
188 system(legacy);
189 free(legacy);
190
191 /* now kill any lingering procs, if we can */
192 kill_procs();
193
194 orte_finalize();
195
196 return ORTE_SUCCESS;
197 }
198 /*
199 * Parse the command line arguments using the functions command
200 * line utility functions.
201 */
202 static int parse_args(int argc, char *argv[]) {
203 int ret;
204 opal_cmd_line_t cmd_line;
205 orte_clean_globals_t tmp = { false, false, false };
206
207 /* NOTE: There is a bug in the PGI 6.2 series that causes the
208 compiler to choke when copying structs containing bool members
209 by value. So do a memcpy here instead. */
210 memcpy(&orte_clean_globals, &tmp, sizeof(tmp));
211
212 /*
213 * Initialize list of available command line options.
214 */
215 opal_cmd_line_create(&cmd_line, cmd_line_opts);
216 ret = opal_cmd_line_parse(&cmd_line, false, false, argc, argv);
217
218 if (OPAL_SUCCESS != ret) {
219 if (OPAL_ERR_SILENT != ret) {
220 fprintf(stderr, "%s: command line error (%s)\n", argv[0],
221 opal_strerror(ret));
222 }
223 return ret;
224 }
225
226 /**
227 * Now start parsing our specific arguments
228 */
229 if (orte_clean_globals.help) {
230 char *str, *args = NULL;
231 args = opal_cmd_line_get_usage_msg(&cmd_line);
232 str = opal_show_help_string("help-orte-clean.txt", "usage", true,
233 args);
234 if (NULL != str) {
235 printf("%s", str);
236 free(str);
237 }
238 free(args);
239 /* If we show the help message, that should be all we do */
240 exit(0);
241 }
242
243 OBJ_DESTRUCT(&cmd_line);
244
245 return ORTE_SUCCESS;
246 }
247
/*
 * Read one line from fp and return it as a freshly allocated string with
 * the line terminator and any trailing blanks (space, tab, CR) removed.
 *
 * At most 1023 bytes are consumed per call; a longer input line is
 * returned in successive pieces.  A line without a trailing newline
 * (e.g. the last line of the stream) is returned intact.
 *
 * Returns NULL when fp is NULL, at end of input / read error, or when
 * memory cannot be allocated.  The caller owns the result and must
 * free() it.
 */
static char *orte_getline(FILE *fp)
{
    char input[1024];
    char *line;
    size_t len;

    if (NULL == fp || NULL == fgets(input, sizeof(input), fp)) {
        return NULL;
    }

    /* Strip from the end: the newline (if any) first, then padding.
     * Only characters that are really terminator/blank are dropped, so a
     * line lacking '\n' does not lose its last character. */
    len = strlen(input);
    while (len > 0 &&
           ('\n' == input[len - 1] || '\r' == input[len - 1] ||
            ' ' == input[len - 1] || '\t' == input[len - 1])) {
        len--;
    }
    input[len] = '\0';

    line = malloc(len + 1);
    if (NULL != line) {
        memcpy(line, input, len + 1);
    }
    return line;
}
269
270 /*
271 * This function makes a call to "ps" to find out the processes that
272 * are running on this node. It then attempts to kill off any orteds
273 * and orteruns that are not related to this job.
274 */
275 static
276 void kill_procs(void) {
277 int ortedpid;
278 char *fullprocname;
279 char *procname;
280 char *pidstr;
281 char *user;
282 int procpid;
283 FILE *psfile;
284 char *inputline;
285 char *this_user;
286 int uid;
287 char *separator = " \t"; /* output can be delimited by space or tab */
288
289 /*
290 * This is the command that is used to get the information about
291 * all the processes that are running. The output looks like the
292 * following:
293 * COMMAND PID UID
294 * tcsh 12556 1000
295 * ps 14424 1000
296 * etc.
297 */
298
299 /*
300 * The configure determines if there is a valid ps command for us to
301 * use. If it is set to unknown, then we skip this section.
302 */
303 char command[] = ORTE_CLEAN_PS_CMD;
304 if (0 == strcmp("unknown", command)) {
305 return;
306 }
307
308 if (orte_clean_globals.verbose) {
309 fprintf(stderr, "orte-clean: killing any lingering procs\n");
310 }
311
312 /*
313 * Get our parent pid which is the pid of the orted.
314 */
315 ortedpid = getppid();
316
317 /* get the userid of the user */
318 uid = getuid();
319 opal_asprintf(&this_user, "%d", uid);
320
321 /*
322 * There is a race condition here. The problem is that we are looking
323 * for any processes named orted. However, one may erroneously find more
324 * orteds then there really are because the orted is doing a series of
325 * fork/execs. If we run with more than one orte-clean on a node, then
326 * one of the orte-cleans may catch the other one while it has forked,
327 * but not exec'ed. It will therefore kill an orte-clean. Now one
328 * can argue it is silly to run more than one orte-clean on a node, and
329 * this is true. We will have to figure out how to prevent this. For
330 * now, we use a big hammer and just sleep a second to decrease the
331 * probability.
332 */
333 sleep(1);
334
335 psfile = popen(command, "r");
336 /*
337 * Read the first line of the output. We just throw it away
338 * as it is the header consisting of the words COMMAND, PID and UID.
339 */
340 if (NULL == (inputline = orte_getline(psfile))) {
341 free(this_user);
342 pclose(psfile);
343 return;
344 }
345 free(inputline); /* dump the header line */
346
347 while (NULL != (inputline = orte_getline(psfile))) {
348
349 /* The three fields are typically seperated by spaces */
350 fullprocname = strtok(inputline, separator);
351 pidstr = strtok(NULL, separator);
352 user = strtok(NULL, separator);
353
354 if (orte_clean_globals.debug) {
355 fprintf(stdout, "\norte-clean: user(pid)=%s, me=%s\n",
356 user, this_user);
357 }
358
359 /* If the user is not us, and the user is not root, then skip
360 * further checking. If the user is root, then continue on as
361 * we want root to kill off everybody. */
362 if ((0 != strcmp(user, this_user)) && (0 != strcmp("0", this_user))) {
363 /* not us */
364 free(inputline);
365 continue;
366 }
367
368 procpid = atoi(pidstr);
369 procname = opal_basename(fullprocname);
370 if (orte_clean_globals.debug) {
371 fprintf(stdout, "orte-clean: fullname=%s, basename=%s, pid=%d\n",
372 fullprocname, procname, procpid);
373 }
374
375 /*
376 * Look for any orteds that are not our parent and attempt to
377 * kill them. We currently do not worry whether we are the
378 * owner or not. If we are not, we will just fail to send
379 * the signal and that is OK. This also allows a root process
380 * to kill all orteds.
381 *
382 * NOTE: need to also look for "(orted)" as a non-active
383 * proc is sometimes reported that way
384 */
385 if (0 == strncmp("orted", procname, strlen("orted")) ||
386 0 == strncmp("(orted)", procname, strlen("(orted)")) ||
387 0 == strncmp("orte-dvm", procname, strlen("orte-dvm")) ||
388 0 == strncmp("(orte-dvm)", procname, strlen("(orte-dvm)"))) {
389 if (procpid != ortedpid) {
390 if (orte_clean_globals.verbose) {
391 fprintf(stderr, "orte-clean: found potential rogue orted process"
392 " (pid=%d,uid=%s), sending SIGKILL...\n",
393 procpid, user);
394 }
395 /*
396 * We ignore the return code here as we do not really
397 * care whether this worked or not.
398 */
399 (void)kill(procpid, SIGKILL);
400 }
401 }
402
403 /*
404 * Now check for any orteruns.
405 */
406 if (0 == strncmp("orterun", procname, strlen("orterun")) ||
407 0 == strncmp("mpirun", procname, strlen("mpirun"))) {
408 /* if we are on the same node as the HNP, then the ortedpid
409 * is the same as that of our orterun, so don't kill it
410 */
411 if (procpid != ortedpid) {
412 if (orte_clean_globals.verbose) {
413 fprintf(stderr, "orte-clean: found potential rogue orterun process"
414 " (pid=%d,uid=%s), sending SIGKILL...\n",
415 procpid, user);
416
417 }
418 /* if we are a singleton, check the hnp_pid as well */
419 if (ORTE_PROC_IS_SINGLETON) {
420 if (procpid != orte_process_info.hnp_pid) {
421 (void)kill(procpid, SIGKILL);
422 }
423 } else {
424 /* We ignore the return code here as we do not really
425 * care whether this worked or not.
426 */
427 (void)kill(procpid, SIGKILL);
428 }
429 }
430 }
431 free(inputline);
432 free(procname);
433 }
434 free(this_user);
435 pclose(psfile);
436 return;
437 }