root/orte/tools/orte-clean/orte-clean.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. main
  2. parse_args
  3. orte_getline
  4. kill_procs

   1 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
   2 /*
   3  * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
   4  *                         University Research and Technology
   5  *                         Corporation.  All rights reserved.
   6  * Copyright (c) 2004-2005 The University of Tennessee and The University
   7  *                         of Tennessee Research Foundation.  All rights
   8  *                         reserved.
   9  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
  10  *                         University of Stuttgart.  All rights reserved.
  11  * Copyright (c) 2004-2005 The Regents of the University of California.
  12  *                         All rights reserved.
  13  * Copyright (c) 2007-2008 Sun Microsystems, Inc.  All rights reserved.
  14  * Copyright (c) 2007-2016 Los Alamos National Security, LLC.  All rights
  15  *                         reserved.
  16  * Copyright (c) 2011-2013 Cisco Systems, Inc.  All rights reserved.
  17  * Copyright (c) 2015      Research Organization for Information Science
  18  *                         and Technology (RIST). All rights reserved.
  19  * Copyright (c) 2015-2018 Intel, Inc.  All rights reserved.
  20  * Copyright (c) 2017      UT-Battelle, LLC. All rights reserved.
  21  * $COPYRIGHT$
  22  *
  23  * Additional copyrights may follow
  24  *
  25  * $HEADER$
  26  */
  27 #include "orte_config.h"
  28 #include "orte/constants.h"
  29 
  30 #include <stdio.h>
  31 #include <errno.h>
  32 #ifdef HAVE_UNISTD_H
  33 #include <unistd.h>
  34 #endif  /* HAVE_UNISTD_H */
  35 #include <stdlib.h>
  36 #ifdef HAVE_SYS_STAT_H
  37 #include <sys/stat.h>
  38 #endif  /* HAVE_SYS_STAT_H */
  39 #ifdef HAVE_SYS_TYPES_H
  40 #include <sys/types.h>
  41 #endif  /* HAVE_SYS_TYPES_H */
  42 #ifdef HAVE_SYS_WAIT_H
  43 #include <sys/wait.h>
  44 #endif  /* HAVE_SYS_WAIT_H */
  45 #ifdef HAVE_SYS_PARAM_H
  46 #include <sys/param.h>
  47 #endif  /* HAVE_SYS_PARAM_H */
  48 #include <string.h>
  49 #ifdef HAVE_DIRENT_H
  50 #include <dirent.h>
  51 #endif  /* HAVE_DIRENT_H */
  52 #include <signal.h>
  53 #ifdef HAVE_PWD_H
  54 #include <pwd.h>
  55 #endif  /* HAVE_PWD_H */
  56 
  57 #include "opal/util/cmd_line.h"
  58 #include "opal/util/opal_environ.h"
  59 #include "opal/util/os_dirpath.h"
  60 #include "opal/util/basename.h"
  61 #include "opal/util/error.h"
  62 #include "opal/util/printf.h"
  63 #include "opal/mca/base/base.h"
  64 #include "opal/util/show_help.h"
  65 
  66 #include "orte/util/proc_info.h"
  67 #include "orte/util/show_help.h"
  68 
  69 #include "opal/runtime/opal.h"
  70 #if OPAL_ENABLE_FT_CR == 1
  71 #include "opal/runtime/opal_cr.h"
  72 #endif
  73 #include "orte/runtime/runtime.h"
  74 
  75 /******************
  76  * Local Functions
  77  ******************/
  78 static int parse_args(int argc, char *argv[]);
  79 static void kill_procs(void);
  80 
  81 /*****************************************
  82  * Global Vars for Command line Arguments
  83  *****************************************/
  84 typedef struct {
  85     bool help;
  86     bool verbose;
  87     bool debug;
  88 } orte_clean_globals_t;
  89 
  90 orte_clean_globals_t orte_clean_globals = {0};
  91 
  92 opal_cmd_line_init_t cmd_line_opts[] = {
  93     { NULL,
  94       'h', NULL, "help",
  95       0,
  96       &orte_clean_globals.help, OPAL_CMD_LINE_TYPE_BOOL,
  97       "This help message" },
  98 
  99     { NULL,
 100       'v', NULL, "verbose",
 101       0,
 102       &orte_clean_globals.verbose, OPAL_CMD_LINE_TYPE_BOOL,
 103       "Generate verbose output" },
 104 
 105     { NULL,
 106       'd', NULL, "debug",
 107       0,
 108       &orte_clean_globals.debug, OPAL_CMD_LINE_TYPE_BOOL,
 109       "Extra debug output for developers to ensure that orte-clean is working" },
 110 
 111     /* End of list */
 112     { NULL,
 113       '\0', NULL, NULL,
 114       0,
 115       NULL, OPAL_CMD_LINE_TYPE_NULL,
 116       NULL }
 117 };
 118 
 119 /*
 120  * This utility will do a brute force clean of a node.  It will
 121  * attempt to clean up any files in the user's session directory.
 122  * It will also look for any orted and orterun processes that are
 123  * not part of this job, and kill them off.
 124 */
 125 int
 126 main(int argc, char *argv[])
 127 {
 128     int ret = ORTE_SUCCESS;
 129 #if OPAL_ENABLE_FT_CR == 1
 130     char *tmp_env_var;
 131 #endif
 132     char *legacy;
 133 
 134     /* This is needed so we can print the help message */
 135     if (ORTE_SUCCESS != (ret = opal_init_util(&argc, &argv))) {
 136         return ret;
 137     }
 138 
 139     if (ORTE_SUCCESS != (ret = parse_args(argc, argv))) {
 140         return ret;
 141     }
 142 
 143 #if OPAL_ENABLE_FT_CR == 1
 144     /* Disable the checkpoint notification routine for this
 145      * tool. As we will never need to checkpoint this tool.
 146      * Note: This must happen before opal_init().
 147      */
 148     opal_cr_set_enabled(false);
 149 
 150     /* Select the none component, since we don't actually use a checkpointer */
 151     (void) mca_base_var_env_name("crs", &tmp_env_var);
 152     opal_setenv(tmp_env_var,
 153                 "none",
 154                 true, &environ);
 155     free(tmp_env_var);
 156     tmp_env_var = NULL;
 157 
 158     (void) mca_base_var_env_name("opal_cr_is_tool", &tmp_env_var);
 159     opal_setenv(tmp_env_var,
 160                 "1", true, NULL);
 161     free(tmp_env_var);
 162 #endif
 163 
 164     if (ORTE_SUCCESS != (ret = orte_init(&argc, &argv, ORTE_PROC_TOOL))) {
 165         return ret;
 166     }
 167 
 168     /*
 169      * Clean out all session directories - we don't have to protect
 170      * our own session directory because (since we are a tool) we
 171      * didn't create one!
 172      */
 173     if (orte_clean_globals.verbose) {
 174         fprintf(stderr, "orte-clean: cleaning session dir tree %s\n",
 175                 orte_process_info.top_session_dir);
 176     }
 177     opal_os_dirpath_destroy(orte_process_info.top_session_dir, true, NULL);
 178 
 179     /* also get rid of any legacy session directories */
 180     opal_asprintf(&legacy, "%s/openmpi-sessions-%d@%s_0",
 181              orte_process_info.tmpdir_base,
 182              (int)geteuid(), orte_process_info.nodename);
 183     opal_os_dirpath_destroy(legacy, true, NULL);
 184     free(legacy);
 185 
 186     /* and finally get rid of any lingering pmix-related artifacts */
 187     opal_asprintf(&legacy, "rm -rf %s/pmix*", orte_process_info.tmpdir_base);
 188     system(legacy);
 189     free(legacy);
 190 
 191     /* now kill any lingering procs, if we can */
 192     kill_procs();
 193 
 194     orte_finalize();
 195 
 196     return ORTE_SUCCESS;
 197 }
 198 /*
 199  * Parse the command line arguments using the functions command
 200  * line utility functions.
 201  */
 202 static int parse_args(int argc, char *argv[]) {
 203     int ret;
 204     opal_cmd_line_t cmd_line;
 205     orte_clean_globals_t tmp = { false, false, false };
 206 
 207     /* NOTE: There is a bug in the PGI 6.2 series that causes the
 208        compiler to choke when copying structs containing bool members
 209        by value.  So do a memcpy here instead. */
 210     memcpy(&orte_clean_globals, &tmp, sizeof(tmp));
 211 
 212     /*
 213      * Initialize list of available command line options.
 214      */
 215     opal_cmd_line_create(&cmd_line, cmd_line_opts);
 216     ret = opal_cmd_line_parse(&cmd_line, false, false, argc, argv);
 217 
 218     if (OPAL_SUCCESS != ret) {
 219         if (OPAL_ERR_SILENT != ret) {
 220             fprintf(stderr, "%s: command line error (%s)\n", argv[0],
 221                     opal_strerror(ret));
 222         }
 223         return ret;
 224     }
 225 
 226     /**
 227      * Now start parsing our specific arguments
 228      */
 229     if (orte_clean_globals.help) {
 230         char *str, *args = NULL;
 231         args = opal_cmd_line_get_usage_msg(&cmd_line);
 232         str = opal_show_help_string("help-orte-clean.txt", "usage", true,
 233                                     args);
 234         if (NULL != str) {
 235             printf("%s", str);
 236             free(str);
 237         }
 238         free(args);
 239         /* If we show the help message, that should be all we do */
 240         exit(0);
 241     }
 242 
 243     OBJ_DESTRUCT(&cmd_line);
 244 
 245     return ORTE_SUCCESS;
 246 }
 247 
 248 static char *orte_getline(FILE *fp)
 249 {
 250     char *ret, *buff;
 251     char input[1024];
 252     int i;
 253 
 254     ret = fgets(input, 1024, fp);
 255     if (NULL != ret) {
 256         /* remove trailing spaces */
 257         for (i=strlen(input)-2; i > 0; i--) {
 258             if (input[i] != ' ') {
 259                 input[i+1] = '\0';
 260                 break;
 261             }
 262         }
 263         buff = strdup(input);
 264         return buff;
 265     }
 266 
 267     return NULL;
 268 }
 269 
 270 /*
 271  * This function makes a call to "ps" to find out the processes that
 272  * are running on this node.  It then attempts to kill off any orteds
 273  * and orteruns that are not related to this job.
 274  */
 275 static
 276 void kill_procs(void) {
 277     int ortedpid;
 278     char *fullprocname;
 279     char *procname;
 280     char *pidstr;
 281     char *user;
 282     int procpid;
 283     FILE *psfile;
 284     char *inputline;
 285     char *this_user;
 286     int uid;
 287     char *separator = " \t";  /* output can be delimited by space or tab */
 288 
 289     /*
 290      * This is the command that is used to get the information about
 291      * all the processes that are running.  The output looks like the
 292      * following:
 293      * COMMAND    PID     UID
 294      * tcsh     12556    1000
 295      * ps       14424    1000
 296      * etc.
 297      */
 298 
 299     /*
 300      * The configure determines if there is a valid ps command for us to
 301      * use.  If it is set to unknown, then we skip this section.
 302      */
 303     char command[] = ORTE_CLEAN_PS_CMD;
 304     if (0 == strcmp("unknown", command)) {
 305         return;
 306     }
 307 
 308     if (orte_clean_globals.verbose) {
 309         fprintf(stderr, "orte-clean: killing any lingering procs\n");
 310     }
 311 
 312     /*
 313      * Get our parent pid which is the pid of the orted.
 314      */
 315     ortedpid = getppid();
 316 
 317     /* get the userid of the user */
 318     uid = getuid();
 319     opal_asprintf(&this_user, "%d", uid);
 320 
 321     /*
 322      * There is a race condition here.  The problem is that we are looking
 323      * for any processes named orted.  However, one may erroneously find more
 324      * orteds then there really are because the orted is doing a series of
 325      * fork/execs. If we run with more than one orte-clean on a node, then
 326      * one of the orte-cleans may catch the other one while it has forked,
 327      * but not exec'ed.  It will therefore kill an orte-clean.  Now one
 328      * can argue it is silly to run more than one orte-clean on a node, and
 329      * this is true.  We will have to figure out how to prevent this.  For
 330      * now, we use a big hammer and just sleep a second to decrease the
 331      * probability.
 332      */
 333     sleep(1);
 334 
 335     psfile = popen(command, "r");
 336     /*
 337      * Read the first line of the output.  We just throw it away
 338      * as it is the header consisting of the words COMMAND, PID and UID.
 339      */
 340     if (NULL == (inputline = orte_getline(psfile))) {
 341         free(this_user);
 342         pclose(psfile);
 343         return;
 344     }
 345     free(inputline);  /* dump the header line */
 346 
 347     while (NULL != (inputline = orte_getline(psfile))) {
 348 
 349         /* The three fields are typically seperated by spaces */
 350         fullprocname = strtok(inputline, separator);
 351         pidstr = strtok(NULL, separator);
 352         user = strtok(NULL, separator);
 353 
 354         if (orte_clean_globals.debug) {
 355             fprintf(stdout, "\norte-clean: user(pid)=%s, me=%s\n",
 356                     user, this_user);
 357         }
 358 
 359         /* If the user is not us, and the user is not root, then skip
 360          * further checking.  If the user is root, then continue on as
 361          * we want root to kill off everybody. */
 362         if ((0 != strcmp(user, this_user)) && (0 != strcmp("0", this_user))) {
 363             /* not us */
 364             free(inputline);
 365             continue;
 366         }
 367 
 368         procpid = atoi(pidstr);
 369         procname = opal_basename(fullprocname);
 370         if (orte_clean_globals.debug) {
 371             fprintf(stdout, "orte-clean: fullname=%s, basename=%s, pid=%d\n",
 372                     fullprocname, procname, procpid);
 373         }
 374 
 375         /*
 376          * Look for any orteds that are not our parent and attempt to
 377          * kill them.  We currently do not worry whether we are the
 378          * owner or not.  If we are not, we will just fail to send
 379          * the signal and that is OK.  This also allows a root process
 380          * to kill all orteds.
 381          *
 382          * NOTE: need to also look for "(orted)" as a non-active
 383          * proc is sometimes reported that way
 384          */
 385         if (0 == strncmp("orted", procname, strlen("orted")) ||
 386             0 == strncmp("(orted)", procname, strlen("(orted)")) ||
 387             0 == strncmp("orte-dvm", procname, strlen("orte-dvm")) ||
 388             0 == strncmp("(orte-dvm)", procname, strlen("(orte-dvm)"))) {
 389             if (procpid != ortedpid) {
 390                 if (orte_clean_globals.verbose) {
 391                     fprintf(stderr, "orte-clean: found potential rogue orted process"
 392                             " (pid=%d,uid=%s), sending SIGKILL...\n",
 393                             procpid, user);
 394                 }
 395                 /*
 396                  * We ignore the return code here as we do not really
 397                  * care whether this worked or not.
 398                  */
 399                 (void)kill(procpid, SIGKILL);
 400             }
 401         }
 402 
 403         /*
 404          * Now check for any orteruns.
 405          */
 406         if (0 == strncmp("orterun", procname, strlen("orterun")) ||
 407             0 == strncmp("mpirun", procname, strlen("mpirun"))) {
 408             /* if we are on the same node as the HNP, then the ortedpid
 409              * is the same as that of our orterun, so don't kill it
 410              */
 411             if (procpid != ortedpid) {
 412                 if (orte_clean_globals.verbose) {
 413                     fprintf(stderr, "orte-clean: found potential rogue orterun process"
 414                             " (pid=%d,uid=%s), sending SIGKILL...\n",
 415                             procpid, user);
 416 
 417                 }
 418                 /* if we are a singleton, check the hnp_pid as well */
 419                 if (ORTE_PROC_IS_SINGLETON) {
 420                     if (procpid != orte_process_info.hnp_pid) {
 421                         (void)kill(procpid, SIGKILL);
 422                     }
 423                 } else {
 424                     /* We ignore the return code here as we do not really
 425                      * care whether this worked or not.
 426                      */
 427                     (void)kill(procpid, SIGKILL);
 428                 }
 429             }
 430         }
 431         free(inputline);
 432         free(procname);
 433     }
 434     free(this_user);
 435     pclose(psfile);
 436     return;
 437 }

/* [<][>][^][v][top][bottom][index][help] */