This source file includes the following definitions.
- odls_alps_kill_local
- orte_odls_alps_kill_local_procs
- set_handler_alps
- write_help_msg
- send_error_show_help
- close_open_file_descriptors
- do_child
- do_parent
- odls_alps_fork_local_proc
- orte_odls_alps_launch_local_procs
- send_signal
- orte_odls_alps_signal_local_procs
- orte_odls_alps_restart_proc
   1 
   2 
   3 
   4 
   5 
   6 
   7 
   8 
   9 
  10 
  11 
  12 
  13 
  14 
  15 
  16 
  17 
  18 
  19 
  20 
  21 
  22 
  23 
  24 
  25 
  26 
  27 
  28 
  29 
  30 
  31 
  32 
  33 
  34 
  35 
  36 
  37 
  38 
  39 
  40 
  41 
  42 
  43 
  44 
  45 
  46 
  47 
  48 
  49 
  50 
  51 
  52 
  53 
  54 
  55 
  56 
  57 
  58 
  59 
  60 
  61 
  62 
  63 
  64 
  65 
  66 
  67 
  68 
  69 
  70 
  71 
  72 #include "orte_config.h"
  73 #include "orte/constants.h"
  74 #include "orte/types.h"
  75 
  76 #include <string.h>
  77 #include <stdlib.h>
  78 #ifdef HAVE_UNISTD_H
  79 #include <unistd.h>
  80 #endif
  81 #include <errno.h>
  82 #ifdef HAVE_SYS_TYPES_H
  83 #include <sys/types.h>
  84 #endif
  85 #ifdef HAVE_SYS_WAIT_H
  86 #include <sys/wait.h>
  87 #endif
  88 #include <signal.h>
  89 #ifdef HAVE_FCNTL_H
  90 #include <fcntl.h>
  91 #endif
  92 #ifdef HAVE_SYS_TIME_H
  93 #include <sys/time.h>
  94 #endif
  95 #ifdef HAVE_SYS_PARAM_H
  96 #include <sys/param.h>
  97 #endif
  98 #ifdef HAVE_NETDB_H
  99 #include <netdb.h>
 100 #endif
 101 #include <stdlib.h>
 102 #ifdef HAVE_SYS_STAT_H
 103 #include <sys/stat.h>
 104 #endif  
 105 #include <stdarg.h>
 106 #ifdef HAVE_SYS_SELECT_H
 107 #include <sys/select.h>
 108 #endif
 109 #ifdef HAVE_DIRENT_H
 110 #include <dirent.h>
 111 #endif
 112 
 113 #include <ctype.h>
 114 
 115 #include "opal/mca/hwloc/hwloc-internal.h"
 116 #include "opal/mca/hwloc/base/base.h"
 117 #include "opal/class/opal_pointer_array.h"
 118 #include "opal/util/opal_environ.h"
 119 #include "opal/util/show_help.h"
 120 #include "opal/util/sys_limits.h"
 121 #include "opal/util/fd.h"
 122 
 123 #include "orte/util/show_help.h"
 124 #include "orte/runtime/orte_wait.h"
 125 #include "orte/runtime/orte_globals.h"
 126 #include "orte/mca/errmgr/errmgr.h"
 127 #include "orte/mca/ess/ess.h"
 128 #include "orte/mca/iof/base/iof_base_setup.h"
 129 #include "orte/mca/plm/plm.h"
 130 #include "orte/mca/rtc/rtc.h"
 131 #include "orte/util/name_fns.h"
 132 
 133 #include "orte/mca/odls/base/base.h"
 134 #include "orte/mca/odls/base/odls_private.h"
 135 #include "orte/mca/odls/alps/odls_alps.h"
 136 #include "orte/orted/pmix/pmix_server.h"
 137 
 138 
 139 
 140 
 141 static int orte_odls_alps_launch_local_procs(opal_buffer_t *data);
 142 static int orte_odls_alps_kill_local_procs(opal_pointer_array_t *procs);
 143 static int orte_odls_alps_signal_local_procs(const orte_process_name_t *proc, int32_t signal);
 144 static int orte_odls_alps_restart_proc(orte_proc_t *child);
 145 
 146 
 147 
 148 
 149 
 150 static void send_error_show_help(int fd, int exit_status,
 151                                  const char *file, const char *topic, ...)
 152     __opal_attribute_noreturn__;
 153 static int do_child(orte_odls_spawn_caddy_t *cd, int write_fd)
 154     __opal_attribute_noreturn__;
 155 
 156 
 157 
 158 
 159 
  /*
   * Module instance for the ALPS odls component.
   *
   * The initializer is positional: the first entry is the base framework's
   * default get_add_procs_data routine, the remaining four are the
   * ALPS-specific launch / kill / signal / restart functions of this file.
   * NOTE(review): the mapping of positions to field names is fixed by the
   * orte_odls_base_module_t declaration, which lives outside this file;
   * keep the order in sync with it.
   */
  160 orte_odls_base_module_t orte_odls_alps_module = {
  161     orte_odls_base_default_get_add_procs_data,
  162     orte_odls_alps_launch_local_procs,
  163     orte_odls_alps_kill_local_procs,
  164     orte_odls_alps_signal_local_procs,
  165     orte_odls_alps_restart_proc
  166 };
 167 
 168 
 169 static int odls_alps_kill_local(pid_t pid, int signum)
 170 {
 171     pid_t pgrp;
 172 
 173 #if HAVE_SETPGID
 174     pgrp = getpgid(pid);
 175     if (-1 != pgrp) {
 176         
 177 
 178 
 179 
 180 
 181 
 182 
 183         pid = pgrp;
 184     }
 185 #endif
 186     if (0 != kill(pid, signum)) {
 187         if (ESRCH != errno) {
 188             OPAL_OUTPUT_VERBOSE((2, orte_odls_base_framework.framework_output,
 189                                  "%s odls:alps:SENT KILL %d TO PID %d GOT ERRNO %d",
 190                                  ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), signum, (int)pid, errno));
 191             return errno;
 192         }
 193     }
 194     OPAL_OUTPUT_VERBOSE((2, orte_odls_base_framework.framework_output,
 195                          "%s odls:alps:SENT KILL %d TO PID %d SUCCESS",
 196                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), signum, (int)pid));
 197     return 0;
 198 }
 199 
 200 int orte_odls_alps_kill_local_procs(opal_pointer_array_t *procs)
 201 {
 202     int rc;
 203 
 204     if (ORTE_SUCCESS != (rc = orte_odls_base_default_kill_local_procs(procs,
 205                                                 odls_alps_kill_local))) {
 206         ORTE_ERROR_LOG(rc);
 207         return rc;
 208     }
 209     return ORTE_SUCCESS;
 210 }
 211 
 212 
 213 static void set_handler_alps(int sig)
 214 {
 215     struct sigaction act;
 216 
 217     act.sa_handler = SIG_DFL;
 218     act.sa_flags = 0;
 219     sigemptyset(&act.sa_mask);
 220 
 221     sigaction(sig, &act, (struct sigaction *)0);
 222 }
 223 
 224 
 225 
 226 
 227 
 228 static int write_help_msg(int fd, orte_odls_pipe_err_msg_t *msg, const char *file,
 229                           const char *topic, va_list ap)
 230 {
 231     int ret;
 232     char *str;
 233 
 234     if (NULL == file || NULL == topic) {
 235         return OPAL_ERR_BAD_PARAM;
 236     }
 237 
 238     str = opal_show_help_vstring(file, topic, true, ap);
 239 
 240     msg->file_str_len = (int) strlen(file);
 241     if (msg->file_str_len > ORTE_ODLS_MAX_FILE_LEN) {
 242         ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
 243         return ORTE_ERR_BAD_PARAM;
 244     }
 245     msg->topic_str_len = (int) strlen(topic);
 246     if (msg->topic_str_len > ORTE_ODLS_MAX_TOPIC_LEN) {
 247         ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
 248         return ORTE_ERR_BAD_PARAM;
 249     }
 250     msg->msg_str_len = (int) strlen(str);
 251 
 252     
 253     if (OPAL_SUCCESS != (ret = opal_fd_write(fd, sizeof(*msg), msg))) {
 254         goto out;
 255     }
 256     if (msg->file_str_len > 0 &&
 257         OPAL_SUCCESS != (ret = opal_fd_write(fd, msg->file_str_len, file))) {
 258         goto out;
 259     }
 260     if (msg->topic_str_len > 0 &&
 261         OPAL_SUCCESS != (ret = opal_fd_write(fd, msg->topic_str_len, topic))) {
 262         goto out;
 263     }
 264     if (msg->msg_str_len > 0 &&
 265         OPAL_SUCCESS != (ret = opal_fd_write(fd, msg->msg_str_len, str))) {
 266         goto out;
 267     }
 268 
 269  out:
 270     free(str);
 271     return ret;
 272 }
 273 
 274 
 275 
 276 
 277 static void send_error_show_help(int fd, int exit_status,
 278                                  const char *file, const char *topic, ...)
 279 {
 280     va_list ap;
 281     orte_odls_pipe_err_msg_t msg;
 282 
 283     msg.fatal = true;
 284     msg.exit_status = exit_status;
 285 
 286     
 287     va_start(ap, topic);
 288     write_help_msg(fd, &msg, file, topic, ap);
 289     va_end(ap);
 290 
 291     exit(exit_status);
 292 }
 293 
 294 static int close_open_file_descriptors(int write_fd,
 295                                       orte_iof_base_io_conf_t opts) {
 296     DIR *dir = opendir("/proc/self/fd");
 297     if (NULL == dir) {
 298         return ORTE_ERR_FILE_OPEN_FAILURE;
 299     }
 300     struct dirent *files;
 301 
 302     
 303 
 304     int dir_scan_fd = dirfd(dir);
 305     if(dir_scan_fd < 0 ) {
 306         return ORTE_ERR_FILE_OPEN_FAILURE;
 307     }
 308 
 309 
 310     while (NULL != (files = readdir(dir))) {
 311         if (!isdigit(files->d_name[0])) {
 312             continue;
 313         }
 314         int fd = strtol(files->d_name, NULL, 10);
 315         if (errno == EINVAL || errno == ERANGE) {
 316             closedir(dir);
 317             return ORTE_ERR_TYPE_MISMATCH;
 318         }
 319         if (fd >=3 &&
 320 #if OPAL_PMIX_V1
 321             fd != opts.p_internal[1] &&
 322 #endif
 323             fd != write_fd && 
 324             fd != dir_scan_fd) {
 325             close(fd);
 326         }
 327     }
 328     closedir(dir);
 329     return ORTE_SUCCESS;
 330 }
 331 
 332 static int do_child(orte_odls_spawn_caddy_t *cd, int write_fd)
 333 {
 334     int i;
 335     sigset_t sigs;
 336 
 337     
 338     opal_fd_set_cloexec(write_fd);
 339 
 340     if (NULL != cd->child) {
 341         
 342 
 343 
 344 
 345 
 346 
 347 
 348 
 349 
 350 
 351 
 352 
 353 
 354 
 355         if (ORTE_FLAG_TEST(cd->jdata, ORTE_JOB_FLAG_FORWARD_OUTPUT)) {
 356             if (ORTE_SUCCESS != (i = orte_iof_base_setup_child(&cd->opts, &cd->env))) {
 357                 ORTE_ERROR_LOG(i);
 358                 send_error_show_help(write_fd, 1,
 359                                      "help-orte-odls-alps.txt",
 360                                      "iof setup failed",
 361                                      orte_process_info.nodename, cd->app->app);
 362                 
 363             }
 364         }
 365 
 366 
 367         
 368         orte_rtc.set(cd->jdata, cd->child, &cd->env, write_fd);
 369 
 370     } else if (!ORTE_FLAG_TEST(cd->jdata, ORTE_JOB_FLAG_FORWARD_OUTPUT)) {
 371         
 372         int fdnull;
 373         for (i=0; i < 3; i++) {
 374             fdnull = open("/dev/null", O_RDONLY, 0);
 375             if (fdnull > i && i != write_fd) {
 376                 dup2(fdnull, i);
 377             }
 378             close(fdnull);
 379         }
 380 #if OPAL_PMIX_V1
 381         fdnull = open("/dev/null", O_RDONLY, 0);
 382         if (fdnull > cd->opts.p_internal[1]) {
 383             dup2(fdnull, cd->opts.p_internal[1]);
 384         }
 385         close(fdnull);
 386 #endif
 387     }
 388 
 389     if (ORTE_SUCCESS != close_open_file_descriptors(write_fd, cd->opts)) {
 390         send_error_show_help(write_fd, 1, "help-orte-odls-alps.txt",
 391                              "close fds",
 392                              orte_process_info.nodename, cd->app->app,
 393                              __FILE__, __LINE__);
 394     }
 395 
 396 
 397     if (cd->argv == NULL) {
 398         cd->argv = malloc(sizeof(char*)*2);
 399         cd->argv[0] = strdup(cd->app->app);
 400         cd->argv[1] = NULL;
 401     }
 402 
 403     
 404 
 405 
 406 
 407 
 408 
 409 
 410     set_handler_alps(SIGTERM);
 411     set_handler_alps(SIGINT);
 412     set_handler_alps(SIGHUP);
 413     set_handler_alps(SIGPIPE);
 414     set_handler_alps(SIGCHLD);
 415 
 416     
 417 
 418 
 419 
 420     sigprocmask(0, 0, &sigs);
 421     sigprocmask(SIG_UNBLOCK, &sigs, 0);
 422 
 423     
 424     if (NULL != cd->wdir) {
 425         if (0 != chdir(cd->wdir)) {
 426             send_error_show_help(write_fd, 1,
 427                                  "help-orterun.txt",
 428                                  "orterun:wdir-not-found",
 429                                  "orted",
 430                                  cd->wdir,
 431                                  orte_process_info.nodename,
 432                                  (NULL == cd->child) ? 0 : cd->child->app_rank);
 433             
 434         }
 435     }
 436 
 437     
 438 
 439     if (10 < opal_output_get_verbosity(orte_odls_base_framework.framework_output)) {
 440         int jout;
 441         opal_output(0, "%s STARTING %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), cd->app->app);
 442         for (jout=0; NULL != cd->argv[jout]; jout++) {
 443             opal_output(0, "%s\tARGV[%d]: %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), jout, cd->argv[jout]);
 444         }
 445         for (jout=0; NULL != cd->env[jout]; jout++) {
 446             opal_output(0, "%s\tENVIRON[%d]: %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), jout, cd->env[jout]);
 447         }
 448     }
 449 
 450     execve(cd->cmd, cd->argv, cd->env);
 451     send_error_show_help(write_fd, 1,
 452                          "help-orte-odls-alps.txt", "execve error",
 453                          orte_process_info.nodename, cd->app->app, strerror(errno));
 454     
 455 }
 456 
 457 
 458 static int do_parent(orte_odls_spawn_caddy_t *cd, int read_fd)
 459 {
 460     int rc;
 461     orte_odls_pipe_err_msg_t msg;
 462     char file[ORTE_ODLS_MAX_FILE_LEN + 1], topic[ORTE_ODLS_MAX_TOPIC_LEN + 1], *str = NULL;
 463 
 464     if (cd->opts.connect_stdin) {
 465         close(cd->opts.p_stdin[0]);
 466     }
 467     close(cd->opts.p_stdout[1]);
 468     if( !orte_iof_base.redirect_app_stderr_to_stdout ) {
 469         close(cd->opts.p_stderr[1]);
 470     }
 471 #if OPAL_PMIX_V1
 472     close(cd->opts.p_internal[1]);
 473 #endif
 474 
 475     
 476     while (1) {
 477         rc = opal_fd_read(read_fd, sizeof(msg), &msg);
 478 
 479         
 480         if (OPAL_ERR_TIMEOUT == rc) {
 481             break;
 482         }
 483 
 484         
 485         if (OPAL_SUCCESS != rc) {
 486             ORTE_ERROR_LOG(rc);
 487             close(read_fd);
 488 
 489             if (NULL != cd->child) {
 490                 cd->child->state = ORTE_PROC_STATE_UNDEF;
 491             }
 492             return rc;
 493         }
 494 
 495         
 496         if (NULL != cd->child) {
 497             if (msg.fatal) {
 498                 ORTE_FLAG_UNSET(cd->child, ORTE_PROC_FLAG_ALIVE);
 499             } else {
 500                 ORTE_FLAG_SET(cd->child, ORTE_PROC_FLAG_ALIVE);
 501             }
 502         }
 503 
 504         
 505         if (msg.file_str_len > 0) {
 506             rc = opal_fd_read(read_fd, msg.file_str_len, file);
 507             if (OPAL_SUCCESS != rc) {
 508                 orte_show_help("help-orte-odls-alps.txt", "syscall fail",
 509                                true,
 510                                orte_process_info.nodename, cd->app,
 511                                "opal_fd_read", __FILE__, __LINE__);
 512                 if (NULL != cd->child) {
 513                     cd->child->state = ORTE_PROC_STATE_UNDEF;
 514                 }
 515                 return rc;
 516             }
 517             file[msg.file_str_len] = '\0';
 518         }
 519         if (msg.topic_str_len > 0) {
 520             rc = opal_fd_read(read_fd, msg.topic_str_len, topic);
 521             if (OPAL_SUCCESS != rc) {
 522                 orte_show_help("help-orte-odls-alps.txt", "syscall fail",
 523                                true,
 524                                orte_process_info.nodename, cd->app,
 525                                "opal_fd_read", __FILE__, __LINE__);
 526                 if (NULL != cd->child) {
 527                     cd->child->state = ORTE_PROC_STATE_UNDEF;
 528                 }
 529                 return rc;
 530             }
 531             topic[msg.topic_str_len] = '\0';
 532         }
 533         if (msg.msg_str_len > 0) {
 534             str = calloc(1, msg.msg_str_len + 1);
 535             if (NULL == str) {
 536                 orte_show_help("help-orte-odls-alps.txt", "syscall fail",
 537                                true,
 538                                orte_process_info.nodename, cd->app,
 539                                "opal_fd_read", __FILE__, __LINE__);
 540                 if (NULL != cd->child) {
 541                     cd->child->state = ORTE_PROC_STATE_UNDEF;
 542                 }
 543                 return rc;
 544             }
 545             rc = opal_fd_read(read_fd, msg.msg_str_len, str);
 546         }
 547 
 548         
 549 
 550         if (msg.msg_str_len > 0) {
 551             orte_show_help_norender(file, topic, false, str);
 552             free(str);
 553             str = NULL;
 554         }
 555 
 556         
 557 
 558 
 559 
 560 
 561         if (msg.fatal) {
 562             if (NULL != cd->child) {
 563                 cd->child->state = ORTE_PROC_STATE_FAILED_TO_START;
 564                 ORTE_FLAG_UNSET(cd->child, ORTE_PROC_FLAG_ALIVE);
 565             }
 566             close(read_fd);
 567             return ORTE_ERR_FAILED_TO_START;
 568         }
 569     }
 570 
 571     
 572 
 573 
 574     if (NULL != cd->child) {
 575         cd->child->state = ORTE_PROC_STATE_RUNNING;
 576         ORTE_FLAG_SET(cd->child, ORTE_PROC_FLAG_ALIVE);
 577     }
 578     close(read_fd);
 579 
 580     return ORTE_SUCCESS;
 581 }
 582 
 583 
 584 
 585 
 586 
 587 static int odls_alps_fork_local_proc(void *cdptr)
 588 {
 589     orte_odls_spawn_caddy_t *cd = (orte_odls_spawn_caddy_t*)cdptr;
 590     int p[2];
 591     pid_t pid;
 592 
 593     
 594 
 595 
 596 
 597 
 598 
 599 
 600 
 601     if (pipe(p) < 0) {
 602         ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_PIPES);
 603         if (NULL != cd->child) {
 604             cd->child->state = ORTE_PROC_STATE_FAILED_TO_START;
 605             cd->child->exit_code = ORTE_ERR_SYS_LIMITS_PIPES;
 606         }
 607         return ORTE_ERR_SYS_LIMITS_PIPES;
 608     }
 609 
 610     
 611     pid = fork();
 612     if (NULL != cd->child) {
 613         cd->child->pid = pid;
 614     }
 615 
 616     if (pid < 0) {
 617         ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_CHILDREN);
 618         if (NULL != cd->child) {
 619             cd->child->state = ORTE_PROC_STATE_FAILED_TO_START;
 620             cd->child->exit_code = ORTE_ERR_SYS_LIMITS_CHILDREN;
 621         }
 622         return ORTE_ERR_SYS_LIMITS_CHILDREN;
 623     }
 624 
 625     if (pid == 0) {
 626         close(p[0]);
 627 #if HAVE_SETPGID
 628         setpgid(0, 0);
 629 #endif
 630         do_child(cd, p[1]);
 631         
 632     }
 633 
 634     close(p[1]);
 635     return do_parent(cd, p[0]);
 636 }
 637 
 638 
 639 
 640 
 641 
 642 
 643 int orte_odls_alps_launch_local_procs(opal_buffer_t *data)
 644 {
 645     orte_jobid_t job;
 646     int rc;
 647 
 648     
 649     if (ORTE_SUCCESS != (rc = orte_odls_base_default_construct_child_list(data, &job))) {
 650         OPAL_OUTPUT_VERBOSE((2, orte_odls_base_framework.framework_output,
 651                              "%s odls:alps:launch:local failed to construct child list on error %s",
 652                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_ERROR_NAME(rc)));
 653         return rc;
 654     }
 655 
 656     
 657 
 658     if (ORTE_SUCCESS != (rc = orte_odls_alps_get_rdma_creds())) {;
 659         OPAL_OUTPUT_VERBOSE((2, orte_odls_base_framework.framework_output,
 660                              "%s odls:alps:launch:failed to get GNI rdma credentials %s",
 661                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_ERROR_NAME(rc)));
 662         return rc;
 663     }
 664 
 665     
 666     ORTE_ACTIVATE_LOCAL_LAUNCH(job, odls_alps_fork_local_proc);
 667 
 668     return ORTE_SUCCESS;
 669 }
 670 
 671 
 672 
 673 
 674 
 675 
 676 static int send_signal(pid_t pid, int signal)
 677 {
 678     int rc = ORTE_SUCCESS;
 679 
 680     OPAL_OUTPUT_VERBOSE((1, orte_odls_base_framework.framework_output,
 681                          "%s sending signal %d to pid %ld",
 682                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 683                          signal, (long)pid));
 684 
 685     if (kill(pid, signal) != 0) {
 686         switch(errno) {
 687             case EINVAL:
 688                 rc = ORTE_ERR_BAD_PARAM;
 689                 break;
 690             case ESRCH:
 691                 
 692 
 693 
 694 
 695 
 696                 break;
 697             case EPERM:
 698                 rc = ORTE_ERR_PERM;
 699                 break;
 700             default:
 701                 rc = ORTE_ERROR;
 702         }
 703     }
 704 
 705     return rc;
 706 }
 707 
 708 static int orte_odls_alps_signal_local_procs(const orte_process_name_t *proc, int32_t signal)
 709 {
 710     int rc;
 711 
 712     if (ORTE_SUCCESS != (rc = orte_odls_base_default_signal_local_procs(proc, signal, send_signal))) {
 713         ORTE_ERROR_LOG(rc);
 714         return rc;
 715     }
 716     return ORTE_SUCCESS;
 717 }
 718 
 719 static int orte_odls_alps_restart_proc(orte_proc_t *child)
 720 {
 721     int rc;
 722 
 723     
 724     if (ORTE_SUCCESS != (rc = orte_odls_base_default_restart_proc(child, odls_alps_fork_local_proc))) {
 725         OPAL_OUTPUT_VERBOSE((2, orte_odls_base_framework.framework_output,
 726                              "%s odls:alps:restart_proc failed to launch on error %s",
 727                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_ERROR_NAME(rc)));
 728     }
 729     return rc;
 730 }