This source file includes the following definitions.
- odls_default_kill_local
- orte_odls_default_kill_local_procs
- set_handler_default
- write_help_msg
- send_error_show_help
- close_open_file_descriptors
- do_child
- do_parent
- odls_default_fork_local_proc
- orte_odls_default_launch_local_procs
- send_signal
- orte_odls_default_signal_local_procs
- orte_odls_default_restart_proc
   1 
   2 
   3 
   4 
   5 
   6 
   7 
   8 
   9 
  10 
  11 
  12 
  13 
  14 
  15 
  16 
  17 
  18 
  19 
  20 
  21 
  22 
  23 
  24 
  25 
  26 
  27 
  28 
  29 
  30 
  31 
  32 
  33 
  34 
  35 
  36 
  37 
  38 
  39 
  40 
  41 
  42 
  43 
  44 
  45 
  46 
  47 
  48 
  49 
  50 
  51 
  52 
  53 
  54 
  55 
  56 
  57 
  58 
  59 
  60 
  61 
  62 
  63 
  64 
  65 
  66 
  67 
  68 
  69 
  70 
  71 #include "orte_config.h"
  72 #include "orte/constants.h"
  73 #include "orte/types.h"
  74 
  75 #include <string.h>
  76 #include <stdlib.h>
  77 #ifdef HAVE_UNISTD_H
  78 #include <unistd.h>
  79 #endif
  80 #include <errno.h>
  81 #ifdef HAVE_SYS_TYPES_H
  82 #include <sys/types.h>
  83 #endif
  84 #ifdef HAVE_SYS_WAIT_H
  85 #include <sys/wait.h>
  86 #endif
  87 #include <signal.h>
  88 #ifdef HAVE_FCNTL_H
  89 #include <fcntl.h>
  90 #endif
  91 #ifdef HAVE_SYS_TIME_H
  92 #include <sys/time.h>
  93 #endif
  94 #ifdef HAVE_SYS_PARAM_H
  95 #include <sys/param.h>
  96 #endif
  97 #ifdef HAVE_NETDB_H
  98 #include <netdb.h>
  99 #endif
 100 #include <stdlib.h>
 101 #ifdef HAVE_SYS_STAT_H
 102 #include <sys/stat.h>
 103 #endif  
 104 #include <stdarg.h>
 105 #ifdef HAVE_SYS_SELECT_H
 106 #include <sys/select.h>
 107 #endif
 108 #ifdef HAVE_DIRENT_H
 109 #include <dirent.h>
 110 #endif
 111 #include <ctype.h>
 112 
 113 #include "opal/mca/hwloc/hwloc-internal.h"
 114 #include "opal/mca/hwloc/base/base.h"
 115 #include "opal/class/opal_pointer_array.h"
 116 #include "opal/util/opal_environ.h"
 117 #include "opal/util/show_help.h"
 118 #include "opal/util/sys_limits.h"
 119 #include "opal/util/fd.h"
 120 
 121 #include "orte/util/show_help.h"
 122 #include "orte/runtime/orte_wait.h"
 123 #include "orte/runtime/orte_globals.h"
 124 #include "orte/mca/errmgr/errmgr.h"
 125 #include "orte/mca/ess/ess.h"
 126 #include "orte/mca/iof/base/iof_base_setup.h"
 127 #include "orte/mca/plm/plm.h"
 128 #include "orte/mca/rtc/rtc.h"
 129 #include "orte/util/name_fns.h"
 130 #include "orte/util/threads.h"
 131 
 132 #include "orte/mca/odls/base/base.h"
 133 #include "orte/mca/odls/base/odls_private.h"
 134 #include "orte/mca/odls/default/odls_default.h"
 135 #include "orte/orted/pmix/pmix_server.h"
 136 
 137 
 138 
 139 
 140 static int orte_odls_default_launch_local_procs(opal_buffer_t *data);
 141 static int orte_odls_default_kill_local_procs(opal_pointer_array_t *procs);
 142 static int orte_odls_default_signal_local_procs(const orte_process_name_t *proc, int32_t signal);
 143 static int orte_odls_default_restart_proc(orte_proc_t *child);
 144 
 145 
 146 
 147 
 148 
 149 static void send_error_show_help(int fd, int exit_status,
 150                                  const char *file, const char *topic, ...)
 151     __opal_attribute_noreturn__;
 152 
 153 static int do_child(orte_odls_spawn_caddy_t *cd, int write_fd)
 154     __opal_attribute_noreturn__;
 155 
 156 
 157 
 158 
 159 
 160 orte_odls_base_module_t orte_odls_default_module = {
 161     .get_add_procs_data = orte_odls_base_default_get_add_procs_data,
 162     .launch_local_procs = orte_odls_default_launch_local_procs,
 163     .kill_local_procs = orte_odls_default_kill_local_procs,
 164     .signal_local_procs = orte_odls_default_signal_local_procs,
 165     .restart_proc = orte_odls_default_restart_proc
 166 };
 167 
 168 
 169 
 170 static int odls_default_kill_local(pid_t pid, int signum)
 171 {
 172     pid_t pgrp;
 173 
 174 #if HAVE_SETPGID
 175     pgrp = getpgid(pid);
 176     if (-1 != pgrp) {
 177         
 178 
 179 
 180 
 181 
 182 
 183 
 184         pid = -pgrp;
 185     }
 186 #endif
 187 
 188     if (0 != kill(pid, signum)) {
 189         if (ESRCH != errno) {
 190             OPAL_OUTPUT_VERBOSE((2, orte_odls_base_framework.framework_output,
 191                                  "%s odls:default:SENT KILL %d TO PID %d GOT ERRNO %d",
 192                                  ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), signum, (int)pid, errno));
 193             return errno;
 194         }
 195     }
 196     OPAL_OUTPUT_VERBOSE((2, orte_odls_base_framework.framework_output,
 197                          "%s odls:default:SENT KILL %d TO PID %d SUCCESS",
 198                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), signum, (int)pid));
 199     return 0;
 200 }
 201 
 202 int orte_odls_default_kill_local_procs(opal_pointer_array_t *procs)
 203 {
 204     int rc;
 205 
 206     if (ORTE_SUCCESS != (rc = orte_odls_base_default_kill_local_procs(procs,
 207                                             odls_default_kill_local))) {
 208         ORTE_ERROR_LOG(rc);
 209         return rc;
 210     }
 211     return ORTE_SUCCESS;
 212 }
 213 
 214 
 215 static void set_handler_default(int sig)
 216 {
 217     struct sigaction act;
 218 
 219     act.sa_handler = SIG_DFL;
 220     act.sa_flags = 0;
 221     sigemptyset(&act.sa_mask);
 222 
 223     sigaction(sig, &act, (struct sigaction *)0);
 224 }
 225 
 226 
 227 
 228 
 229 
 230 static int write_help_msg(int fd, orte_odls_pipe_err_msg_t *msg, const char *file,
 231                           const char *topic, va_list ap)
 232 {
 233     int ret;
 234     char *str;
 235 
 236     if (NULL == file || NULL == topic) {
 237         return OPAL_ERR_BAD_PARAM;
 238     }
 239 
 240     str = opal_show_help_vstring(file, topic, true, ap);
 241 
 242     msg->file_str_len = (int) strlen(file);
 243     if (msg->file_str_len > ORTE_ODLS_MAX_FILE_LEN) {
 244         ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
 245         return ORTE_ERR_BAD_PARAM;
 246     }
 247     msg->topic_str_len = (int) strlen(topic);
 248     if (msg->topic_str_len > ORTE_ODLS_MAX_TOPIC_LEN) {
 249         ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
 250         return ORTE_ERR_BAD_PARAM;
 251     }
 252     msg->msg_str_len = (int) strlen(str);
 253 
 254     
 255     if (OPAL_SUCCESS != (ret = opal_fd_write(fd, sizeof(*msg), msg))) {
 256         goto out;
 257     }
 258     if (msg->file_str_len > 0 &&
 259         OPAL_SUCCESS != (ret = opal_fd_write(fd, msg->file_str_len, file))) {
 260         goto out;
 261     }
 262     if (msg->topic_str_len > 0 &&
 263         OPAL_SUCCESS != (ret = opal_fd_write(fd, msg->topic_str_len, topic))) {
 264         goto out;
 265     }
 266     if (msg->msg_str_len > 0 &&
 267         OPAL_SUCCESS != (ret = opal_fd_write(fd, msg->msg_str_len, str))) {
 268         goto out;
 269     }
 270 
 271  out:
 272     free(str);
 273     return ret;
 274 }
 275 
 276 
 277 
 278 
 279 static void send_error_show_help(int fd, int exit_status,
 280                                  const char *file, const char *topic, ...)
 281 {
 282     va_list ap;
 283     orte_odls_pipe_err_msg_t msg;
 284 
 285     msg.fatal = true;
 286     msg.exit_status = exit_status;
 287 
 288     
 289     va_start(ap, topic);
 290     write_help_msg(fd, &msg, file, topic, ap);
 291     va_end(ap);
 292 
 293     exit(exit_status);
 294 }
 295 
 296 
 297 
 298 static int close_open_file_descriptors(int write_fd,
 299                                       orte_iof_base_io_conf_t opts) {
 300     DIR *dir = opendir("/proc/self/fd");
 301     if (NULL == dir) {
 302         return ORTE_ERR_FILE_OPEN_FAILURE;
 303     }
 304     struct dirent *files;
 305 
 306     
 307 
 308     int dir_scan_fd = dirfd(dir);
 309     if(dir_scan_fd < 0 ) {
 310         return ORTE_ERR_FILE_OPEN_FAILURE;
 311     }
 312 
 313     
 314     while (NULL != (files = readdir(dir))) {
 315         if (!isdigit(files->d_name[0])) {
 316             continue;
 317         }
 318         int fd = strtol(files->d_name, NULL, 10);
 319         if (errno == EINVAL || errno == ERANGE) {
 320             closedir(dir);
 321             return ORTE_ERR_TYPE_MISMATCH;
 322         }
 323         if (fd >=3 &&
 324 #if OPAL_PMIX_V1
 325             fd != opts.p_internal[1] &&
 326 #endif
 327             fd != write_fd && 
 328             fd != dir_scan_fd) {
 329             close(fd);
 330         }
 331     }
 332     closedir(dir);
 333     return ORTE_SUCCESS;
 334 }
 335 
 336 static int do_child(orte_odls_spawn_caddy_t *cd, int write_fd)
 337 {
 338     int i;
 339     sigset_t sigs;
 340     long fd, fdmax = sysconf(_SC_OPEN_MAX);
 341     char dir[MAXPATHLEN];
 342 
 343 #if HAVE_SETPGID
 344     
 345 
 346     setpgid(0, 0);
 347 #endif
 348 
 349     
 350     opal_fd_set_cloexec(write_fd);
 351 
 352     if (NULL != cd->child) {
 353         
 354 
 355 
 356 
 357 
 358 
 359 
 360 
 361 
 362 
 363 
 364 
 365 
 366         if (ORTE_FLAG_TEST(cd->jdata, ORTE_JOB_FLAG_FORWARD_OUTPUT)) {
 367             if (ORTE_SUCCESS != (i = orte_iof_base_setup_child(&cd->opts, &cd->env))) {
 368                 ORTE_ERROR_LOG(i);
 369                 send_error_show_help(write_fd, 1,
 370                                      "help-orte-odls-default.txt",
 371                                      "iof setup failed",
 372                                      orte_process_info.nodename, cd->app->app);
 373                 
 374             }
 375         }
 376 
 377         
 378         orte_rtc.set(cd->jdata, cd->child, &cd->env, write_fd);
 379 
 380     } else if (!ORTE_FLAG_TEST(cd->jdata, ORTE_JOB_FLAG_FORWARD_OUTPUT)) {
 381         
 382         int fdnull;
 383         for (i=0; i < 3; i++) {
 384             fdnull = open("/dev/null", O_RDONLY, 0);
 385             if (fdnull > i && i != write_fd) {
 386                 dup2(fdnull, i);
 387             }
 388             close(fdnull);
 389         }
 390 #if OPAL_PMIX_V1
 391         fdnull = open("/dev/null", O_RDONLY, 0);
 392         if (fdnull > cd->opts.p_internal[1]) {
 393             dup2(fdnull, cd->opts.p_internal[1]);
 394         }
 395         close(fdnull);
 396 #endif
 397     }
 398 
 399     
 400 
 401 
 402     if (ORTE_SUCCESS != close_open_file_descriptors(write_fd, cd->opts)) {
 403         
 404         for(fd=3; fd<fdmax; fd++) {
 405             if (
 406 #if OPAL_PMIX_V1
 407                 fd != cd->opts.p_internal[1] &&
 408 #endif
 409                 fd != write_fd) {
 410                 close(fd);
 411             }
 412         }
 413     }
 414 
 415     if (cd->argv == NULL) {
 416         cd->argv = malloc(sizeof(char*)*2);
 417         cd->argv[0] = strdup(cd->app->app);
 418         cd->argv[1] = NULL;
 419     }
 420 
 421     
 422 
 423 
 424 
 425 
 426 
 427 
 428     set_handler_default(SIGTERM);
 429     set_handler_default(SIGINT);
 430     set_handler_default(SIGHUP);
 431     set_handler_default(SIGPIPE);
 432     set_handler_default(SIGCHLD);
 433 
 434     
 435 
 436 
 437 
 438     sigprocmask(0, 0, &sigs);
 439     sigprocmask(SIG_UNBLOCK, &sigs, 0);
 440 
 441     
 442     if (NULL != cd->wdir) {
 443         if (0 != chdir(cd->wdir)) {
 444             send_error_show_help(write_fd, 1,
 445                                  "help-orterun.txt",
 446                                  "orterun:wdir-not-found",
 447                                  "orted",
 448                                  cd->wdir,
 449                                  orte_process_info.nodename,
 450                                  (NULL == cd->child) ? 0 : cd->child->app_rank);
 451             
 452         }
 453     }
 454 
 455     
 456     execve(cd->cmd, cd->argv, cd->env);
 457     getcwd(dir, sizeof(dir));
 458     send_error_show_help(write_fd, 1,
 459                          "help-orte-odls-default.txt", "execve error",
 460                          orte_process_info.nodename, dir, cd->app->app, strerror(errno));
 461     
 462 }
 463 
 464 
 465 static int do_parent(orte_odls_spawn_caddy_t *cd, int read_fd)
 466 {
 467     int rc;
 468     orte_odls_pipe_err_msg_t msg;
 469     char file[ORTE_ODLS_MAX_FILE_LEN + 1], topic[ORTE_ODLS_MAX_TOPIC_LEN + 1], *str = NULL;
 470 
 471     if (cd->opts.connect_stdin) {
 472         close(cd->opts.p_stdin[0]);
 473     }
 474     close(cd->opts.p_stdout[1]);
 475     if( !orte_iof_base.redirect_app_stderr_to_stdout ) {
 476         close(cd->opts.p_stderr[1]);
 477     }
 478 #if OPAL_PMIX_V1
 479     close(cd->opts.p_internal[1]);
 480 #endif
 481 
 482     
 483     while (1) {
 484         rc = opal_fd_read(read_fd, sizeof(msg), &msg);
 485 
 486         
 487         if (OPAL_ERR_TIMEOUT == rc) {
 488             break;
 489         }
 490 
 491         
 492         if (OPAL_SUCCESS != rc) {
 493             ORTE_ERROR_LOG(rc);
 494             close(read_fd);
 495 
 496             if (NULL != cd->child) {
 497                 cd->child->state = ORTE_PROC_STATE_UNDEF;
 498             }
 499             return rc;
 500         }
 501 
 502         
 503         if (NULL != cd->child) {
 504             if (msg.fatal) {
 505                 ORTE_FLAG_UNSET(cd->child, ORTE_PROC_FLAG_ALIVE);
 506             } else {
 507                 ORTE_FLAG_SET(cd->child, ORTE_PROC_FLAG_ALIVE);
 508             }
 509         }
 510 
 511         
 512         if (msg.file_str_len > 0) {
 513             rc = opal_fd_read(read_fd, msg.file_str_len, file);
 514             if (OPAL_SUCCESS != rc) {
 515                 orte_show_help("help-orte-odls-default.txt", "syscall fail",
 516                                true,
 517                                orte_process_info.nodename, cd->app->app,
 518                                "opal_fd_read", __FILE__, __LINE__);
 519                 if (NULL != cd->child) {
 520                     cd->child->state = ORTE_PROC_STATE_UNDEF;
 521                 }
 522                 return rc;
 523             }
 524             file[msg.file_str_len] = '\0';
 525         }
 526         if (msg.topic_str_len > 0) {
 527             rc = opal_fd_read(read_fd, msg.topic_str_len, topic);
 528             if (OPAL_SUCCESS != rc) {
 529                 orte_show_help("help-orte-odls-default.txt", "syscall fail",
 530                                true,
 531                                orte_process_info.nodename, cd->app->app,
 532                                "opal_fd_read", __FILE__, __LINE__);
 533                 if (NULL != cd->child) {
 534                     cd->child->state = ORTE_PROC_STATE_UNDEF;
 535                 }
 536                 return rc;
 537             }
 538             topic[msg.topic_str_len] = '\0';
 539         }
 540         if (msg.msg_str_len > 0) {
 541             str = calloc(1, msg.msg_str_len + 1);
 542             if (NULL == str) {
 543                 orte_show_help("help-orte-odls-default.txt", "syscall fail",
 544                                true,
 545                                orte_process_info.nodename, cd->app->app,
 546                                "opal_fd_read", __FILE__, __LINE__);
 547                 if (NULL != cd->child) {
 548                     cd->child->state = ORTE_PROC_STATE_UNDEF;
 549                 }
 550                 return rc;
 551             }
 552             rc = opal_fd_read(read_fd, msg.msg_str_len, str);
 553         }
 554 
 555         
 556 
 557         if (msg.msg_str_len > 0) {
 558             orte_show_help_norender(file, topic, false, str);
 559             free(str);
 560             str = NULL;
 561         }
 562 
 563         
 564 
 565 
 566 
 567 
 568         if (msg.fatal) {
 569             if (NULL != cd->child) {
 570                 cd->child->state = ORTE_PROC_STATE_FAILED_TO_START;
 571                 ORTE_FLAG_UNSET(cd->child, ORTE_PROC_FLAG_ALIVE);
 572             }
 573             close(read_fd);
 574             return ORTE_ERR_FAILED_TO_START;
 575         }
 576     }
 577 
 578     
 579 
 580 
 581     if (NULL != cd->child) {
 582         cd->child->state = ORTE_PROC_STATE_RUNNING;
 583         ORTE_FLAG_SET(cd->child, ORTE_PROC_FLAG_ALIVE);
 584     }
 585     close(read_fd);
 586 
 587     return ORTE_SUCCESS;
 588 }
 589 
 590 
 591 
 592 
 593 
 594 static int odls_default_fork_local_proc(void *cdptr)
 595 {
 596     orte_odls_spawn_caddy_t *cd = (orte_odls_spawn_caddy_t*)cdptr;
 597     int p[2];
 598     pid_t pid;
 599     orte_proc_t *child = cd->child;
 600 
 601     
 602 
 603 
 604 
 605 
 606 
 607 
 608 
 609     if (pipe(p) < 0) {
 610         ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_PIPES);
 611         if (NULL != child) {
 612             child->state = ORTE_PROC_STATE_FAILED_TO_START;
 613             child->exit_code = ORTE_ERR_SYS_LIMITS_PIPES;
 614         }
 615         return ORTE_ERR_SYS_LIMITS_PIPES;
 616     }
 617 
 618     
 619     pid = fork();
 620     if (NULL != child) {
 621         child->pid = pid;
 622     }
 623 
 624     if (pid < 0) {
 625         ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_CHILDREN);
 626         if (NULL != child) {
 627             child->state = ORTE_PROC_STATE_FAILED_TO_START;
 628             child->exit_code = ORTE_ERR_SYS_LIMITS_CHILDREN;
 629         }
 630         return ORTE_ERR_SYS_LIMITS_CHILDREN;
 631     }
 632 
 633     if (pid == 0) {
 634         close(p[0]);
 635         do_child(cd, p[1]);
 636         
 637     }
 638 
 639     close(p[1]);
 640     return do_parent(cd, p[0]);
 641 }
 642 
 643 
 644 
 645 
 646 
 647 
 648 int orte_odls_default_launch_local_procs(opal_buffer_t *data)
 649 {
 650     int rc;
 651     orte_jobid_t job;
 652 
 653     
 654     if (ORTE_SUCCESS != (rc = orte_odls_base_default_construct_child_list(data, &job))) {
 655         OPAL_OUTPUT_VERBOSE((2, orte_odls_base_framework.framework_output,
 656                              "%s odls:default:launch:local failed to construct child list on error %s",
 657                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_ERROR_NAME(rc)));
 658         return rc;
 659     }
 660 
 661     
 662     ORTE_ACTIVATE_LOCAL_LAUNCH(job, odls_default_fork_local_proc);
 663 
 664     return ORTE_SUCCESS;
 665 }
 666 
 667 
 668 
 669 
 670 
 671 
 672 static int send_signal(pid_t pd, int signal)
 673 {
 674     int rc = ORTE_SUCCESS;
 675     pid_t pid;
 676 
 677     if (orte_odls_globals.signal_direct_children_only) {
 678         pid = pd;
 679     } else {
 680 #if HAVE_SETPGID
 681         
 682 
 683         pid = -pd;
 684 #else
 685         pid = pd;
 686 #endif
 687     }
 688 
 689     OPAL_OUTPUT_VERBOSE((1, orte_odls_base_framework.framework_output,
 690                          "%s sending signal %d to pid %ld",
 691                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 692                          signal, (long)pid));
 693 
 694     if (kill(pid, signal) != 0) {
 695         switch(errno) {
 696             case EINVAL:
 697                 rc = ORTE_ERR_BAD_PARAM;
 698                 break;
 699             case ESRCH:
 700                 
 701 
 702 
 703 
 704 
 705                 break;
 706             case EPERM:
 707                 rc = ORTE_ERR_PERM;
 708                 break;
 709             default:
 710                 rc = ORTE_ERROR;
 711         }
 712     }
 713 
 714     return rc;
 715 }
 716 
 717 static int orte_odls_default_signal_local_procs(const orte_process_name_t *proc, int32_t signal)
 718 {
 719     int rc;
 720 
 721     if (ORTE_SUCCESS != (rc = orte_odls_base_default_signal_local_procs(proc, signal, send_signal))) {
 722         ORTE_ERROR_LOG(rc);
 723         return rc;
 724     }
 725     return ORTE_SUCCESS;
 726 }
 727 
 728 static int orte_odls_default_restart_proc(orte_proc_t *child)
 729 {
 730     int rc;
 731 
 732     
 733     if (ORTE_SUCCESS != (rc = orte_odls_base_default_restart_proc(child, odls_default_fork_local_proc))) {
 734         OPAL_OUTPUT_VERBOSE((2, orte_odls_base_framework.framework_output,
 735                              "%s odls:default:restart_proc failed to launch on error %s",
 736                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_ERROR_NAME(rc)));
 737     }
 738     return rc;
 739 }