This source file includes the following definitions.
- caddy_const
- caddy_dest
- rsh_init
- rsh_wait_daemon
- setup_launch
- ssh_child
- remote_spawn
- rsh_launch
- process_launch_list
- launch_daemons
- rsh_terminate_orteds
- rsh_finalize
- set_handler_default
- find_shell
- launch_agent_setup
- rsh_probe
- setup_shell
   1 
   2 
   3 
   4 
   5 
   6 
   7 
   8 
   9 
  10 
  11 
  12 
  13 
  14 
  15 
  16 
  17 
  18 
  19 
  20 
  21 
  22 
  23 
  24 
  25 
  26 
  27 
  28 
  29 
  30 
  31 
  32 
  33 #include "orte_config.h"
  34 #include "orte/constants.h"
  35 
  36 #include <stdlib.h>
  37 #ifdef HAVE_UNISTD_H
  38 #include <unistd.h>
  39 #endif
  40 #include <errno.h>
  41 #include <string.h>
  42 #ifdef HAVE_STRINGS_H
  43 #include <strings.h>
  44 #endif
  45 #ifdef HAVE_SYS_SELECT_H
  46 #include <sys/select.h>
  47 #endif
  48 #ifdef HAVE_SYS_TIME_H
  49 #include <sys/time.h>
  50 #endif
  51 #include <time.h>
  52 #ifdef HAVE_SYS_TYPES_H
  53 #include <sys/types.h>
  54 #endif
  55 #ifdef HAVE_SYS_STAT_H
  56 #include <sys/stat.h>
  57 #endif
  58 #ifdef HAVE_SYS_WAIT_H
  59 #include <sys/wait.h>
  60 #endif
  61 #include <fcntl.h>
  62 #include <signal.h>
  63 #ifdef HAVE_PWD_H
  64 #include <pwd.h>
  65 #endif
  66 
  67 #include "opal/mca/installdirs/installdirs.h"
  68 #include "opal/util/output.h"
  69 #include "opal/mca/base/base.h"
  70 #include "opal/mca/event/event.h"
  71 #include "opal/util/argv.h"
  72 #include "opal/util/opal_environ.h"
  73 #include "opal/util/basename.h"
  74 #include "opal/util/path.h"
  75 #include "opal/class/opal_pointer_array.h"
  76 
  77 #include "orte/util/show_help.h"
  78 #include "orte/runtime/orte_wait.h"
  79 #include "orte/runtime/orte_globals.h"
  80 #include "orte/util/name_fns.h"
  81 #include "orte/util/proc_info.h"
  82 #include "orte/util/threads.h"
  83 
  84 #include "orte/mca/rml/rml.h"
  85 #include "orte/mca/rml/rml_types.h"
  86 #include "orte/mca/ess/ess.h"
  87 #include "orte/mca/ess/base/base.h"
  88 #include "orte/mca/errmgr/errmgr.h"
  89 #include "orte/mca/grpcomm/base/base.h"
  90 #include "orte/mca/oob/base/base.h"
  91 #include "orte/mca/rmaps/rmaps.h"
  92 #include "orte/mca/routed/routed.h"
  93 #include "orte/mca/rml/base/rml_contact.h"
  94 #include "orte/mca/state/state.h"
  95 
  96 #include "orte/mca/plm/plm.h"
  97 #include "orte/mca/plm/base/base.h"
  98 #include "orte/mca/plm/base/plm_private.h"
  99 #include "orte/mca/plm/rsh/plm_rsh.h"
 100 
/*
 * Forward declarations of the module entry points.  They are needed
 * because the orte_plm_rsh_module table that follows refers to them
 * before their definitions appear.
 */
static int rsh_init(void);
static int rsh_launch(orte_job_t *jdata);
static int remote_spawn(void);
static int rsh_terminate_orteds(void);
static int rsh_finalize(void);
 106 
/*
 * Function table exported by the rsh PLM module.
 *
 * The initializer is positional, so the order of the entries has to match
 * the field order of orte_plm_base_module_t (declared outside this file).
 * The rsh_* entries and remote_spawn are implemented here; the
 * orte_plm_base_* entries are not defined in this file (presumably shared
 * base implementations -- confirm against the plm base headers).
 */
orte_plm_base_module_t orte_plm_rsh_module = {
    rsh_init,
    orte_plm_base_set_hnp_name,
    rsh_launch,
    remote_spawn,
    orte_plm_base_orted_terminate_job,
    rsh_terminate_orteds,
    orte_plm_base_orted_kill_local_procs,
    orte_plm_base_orted_signal_local_procs,
    rsh_finalize
};
 118 
/*
 * Per-daemon launch record.  One caddy is queued on launch_list for every
 * daemon to be started; process_launch_list() later removes it, forks the
 * launch agent with its argv and hands the caddy to the wait callback.
 */
typedef struct {
    opal_list_item_t super;   /* list linkage so the caddy can sit on an opal_list_t */
    int argc;                 /* argument count that accompanies argv */
    char **argv;              /* private copy of the command line; freed by the destructor */
    orte_proc_t *daemon;      /* daemon this launch belongs to; released by the destructor */
} orte_plm_rsh_caddy_t;
 125 static void caddy_const(orte_plm_rsh_caddy_t *ptr)
 126 {
 127     ptr->argv = NULL;
 128     ptr->daemon = NULL;
 129 }
 130 static void caddy_dest(orte_plm_rsh_caddy_t *ptr)
 131 {
 132     if (NULL != ptr->argv) {
 133         opal_argv_free(ptr->argv);
 134     }
 135     if (NULL != ptr->daemon) {
 136         OBJ_RELEASE(ptr->daemon);
 137     }
 138 }
/*
 * Instantiate the class descriptor for the caddy: it derives from
 * opal_list_item_t and uses caddy_const / caddy_dest as its constructor
 * and destructor.
 */
OBJ_CLASS_INSTANCE(orte_plm_rsh_caddy_t,
                   opal_list_item_t,
                   caddy_const, caddy_dest);
 142 
 143 typedef enum {
 144     ORTE_PLM_RSH_SHELL_BASH = 0,
 145     ORTE_PLM_RSH_SHELL_ZSH,
 146     ORTE_PLM_RSH_SHELL_TCSH,
 147     ORTE_PLM_RSH_SHELL_CSH,
 148     ORTE_PLM_RSH_SHELL_KSH,
 149     ORTE_PLM_RSH_SHELL_SH,
 150     ORTE_PLM_RSH_SHELL_UNKNOWN
 151 } orte_plm_rsh_shell_t;
 152 
 153 
 154 static const char *orte_plm_rsh_shell_name[7] = {
 155     "bash",
 156     "zsh",
 157     "tcsh",       
 158     "csh",
 159     "ksh",
 160     "sh",
 161     "unknown"
 162 };
 163 
 164 
 165 
 166 
/*
 * Prototypes for the file-local helpers.
 */
static void set_handler_default(int sig);
static orte_plm_rsh_shell_t find_shell(char *shell);
static int launch_agent_setup(const char *agent, char *path);
/* runs in the forked child; ends in execve() or exit(), hence noreturn */
static void ssh_child(int argc, char **argv) __opal_attribute_noreturn__;
static int rsh_probe(char *nodename,
                     orte_plm_rsh_shell_t *shell);
static int setup_shell(orte_plm_rsh_shell_t *rshell,
                       orte_plm_rsh_shell_t *lshell,
                       char *nodename, int *argc, char ***argv);
/* state-machine callback registered for ORTE_JOB_STATE_LAUNCH_DAEMONS */
static void launch_daemons(int fd, short args, void *cbdata);
/* event callback attached to launch_event */
static void process_launch_list(int fd, short args, void *cbdata);
 178 
 179 
/*
 * File-scope state shared by the launch machinery.
 */
/* launches forked but not yet reaped; bumped in process_launch_list(),
 * dropped in rsh_wait_daemon() */
static int num_in_progress=0;
/* caddies waiting for process_launch_list() to fork them */
static opal_list_t launch_list;
/* event whose activation runs process_launch_list() */
static opal_event_t launch_event;
/* path of the launch agent; this is what ssh_child() passes to execve() */
static char *rsh_agent_path=NULL;
/* leading argv of the launch agent; setup_launch() starts from a copy of it */
static char **rsh_agent_argv=NULL;
 185 
 186 
 187 
 188 
 189 static int rsh_init(void)
 190 {
 191     char *tmp;
 192     int rc;
 193 
 194     
 195     if (mca_plm_rsh_component.using_qrsh) {
 196         
 197         opal_asprintf(&tmp, "%s/bin/%s", getenv("SGE_ROOT"), getenv("ARC"));
 198         if (ORTE_SUCCESS != (rc = launch_agent_setup("qrsh", tmp))) {
 199             ORTE_ERROR_LOG(rc);
 200             free(tmp);
 201             return rc;
 202         }
 203         free(tmp);
 204         
 205         opal_argv_append_nosize(&rsh_agent_argv, "-inherit");
 206         
 207 
 208         opal_argv_append_nosize(&rsh_agent_argv, "-nostdin");
 209         opal_argv_append_nosize(&rsh_agent_argv, "-V");
 210         if (0 < opal_output_get_verbosity(orte_plm_base_framework.framework_output)) {
 211             opal_argv_append_nosize(&rsh_agent_argv, "-verbose");
 212             tmp = opal_argv_join(rsh_agent_argv, ' ');
 213             opal_output_verbose(1, orte_plm_base_framework.framework_output,
 214                                 "%s plm:rsh: using \"%s\" for launching\n",
 215                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), tmp);
 216             free(tmp);
 217         }
 218     } else if(mca_plm_rsh_component.using_llspawn) {
 219         
 220         if (ORTE_SUCCESS != (rc = launch_agent_setup("llspawn", NULL))) {
 221             ORTE_ERROR_LOG(rc);
 222             return rc;
 223         }
 224         opal_output_verbose(1, orte_plm_base_framework.framework_output,
 225                             "%s plm:rsh: using \"%s\" for launching\n",
 226                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 227                             rsh_agent_path);
 228     } else {
 229         
 230         if (ORTE_SUCCESS != (rc = launch_agent_setup(mca_plm_rsh_component.agent, NULL))) {
 231             ORTE_ERROR_LOG(rc);
 232             return rc;
 233         }
 234     }
 235 
 236     
 237     if (ORTE_SUCCESS != (rc = orte_state.add_job_state(ORTE_JOB_STATE_LAUNCH_DAEMONS,
 238                                                        launch_daemons, ORTE_SYS_PRI))) {
 239         ORTE_ERROR_LOG(rc);
 240         return rc;
 241     }
 242 
 243     
 244     OBJ_CONSTRUCT(&launch_list, opal_list_t);
 245     opal_event_set(orte_event_base, &launch_event, -1, 0, process_launch_list, NULL);
 246     opal_event_set_priority(&launch_event, ORTE_SYS_PRI);
 247 
 248     
 249     if (ORTE_SUCCESS != (rc = orte_plm_base_comm_start())) {
 250         ORTE_ERROR_LOG(rc);
 251     }
 252 
 253     
 254     orte_plm_globals.daemon_nodes_assigned_at_launch = true;
 255 
 256     return rc;
 257 }
 258 
 259 
 260 
 261 
 262 static void rsh_wait_daemon(int sd, short flags, void *cbdata)
 263 {
 264     orte_job_t *jdata;
 265     orte_wait_tracker_t *t2 = (orte_wait_tracker_t*)cbdata;
 266     orte_plm_rsh_caddy_t *caddy=(orte_plm_rsh_caddy_t*)t2->cbdata;
 267     orte_proc_t *daemon = caddy->daemon;
 268 
 269     if (orte_orteds_term_ordered || orte_abnormal_term_ordered) {
 270         
 271 
 272 
 273         OBJ_RELEASE(caddy);
 274         OBJ_RELEASE(t2);
 275         return;
 276     }
 277 
 278     if (!WIFEXITED(daemon->exit_code) ||
 279         WEXITSTATUS(daemon->exit_code) != 0) { 
 280         
 281 
 282 
 283         if (!ORTE_PROC_IS_HNP) {
 284             opal_buffer_t *buf;
 285             OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
 286                                  "%s daemon %d failed with status %d",
 287                                  ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 288                                  (int)daemon->name.vpid, WEXITSTATUS(daemon->exit_code)));
 289             buf = OBJ_NEW(opal_buffer_t);
 290             opal_dss.pack(buf, &(daemon->name.vpid), 1, ORTE_VPID);
 291             opal_dss.pack(buf, &daemon->exit_code, 1, OPAL_INT);
 292             orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, buf,
 293                                     ORTE_RML_TAG_REPORT_REMOTE_LAUNCH,
 294                                     orte_rml_send_callback, NULL);
 295             
 296             daemon->state = ORTE_PROC_STATE_FAILED_TO_START;
 297         } else {
 298             jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
 299 
 300             OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
 301                                  "%s daemon %d failed with status %d",
 302                                  ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 303                                  (int)daemon->name.vpid, WEXITSTATUS(daemon->exit_code)));
 304             
 305             ORTE_UPDATE_EXIT_STATUS(WEXITSTATUS(daemon->exit_code));
 306             
 307             daemon->state = ORTE_PROC_STATE_FAILED_TO_START;
 308             
 309             jdata->num_terminated++;
 310             
 311 
 312 
 313             orte_routed.route_lost(&daemon->name);
 314             
 315             ORTE_ACTIVATE_PROC_STATE(&daemon->name, ORTE_PROC_STATE_FAILED_TO_START);
 316         }
 317     }
 318 
 319     
 320     --num_in_progress;
 321     if (num_in_progress < mca_plm_rsh_component.num_concurrent) {
 322         
 323         opal_event_active(&launch_event, EV_WRITE, 1);
 324     }
 325     
 326     OBJ_RELEASE(t2);
 327 }
 328 
 329 static int setup_launch(int *argcptr, char ***argvptr,
 330                         char *nodename,
 331                         int *node_name_index1,
 332                         int *proc_vpid_index, char *prefix_dir)
 333 {
 334     int argc;
 335     char **argv;
 336     char *param, *value;
 337     orte_plm_rsh_shell_t remote_shell, local_shell;
 338     int orted_argc;
 339     char **orted_argv;
 340     char *orted_cmd, *orted_prefix, *final_cmd;
 341     int orted_index;
 342     int rc;
 343     int i, j;
 344     bool found;
 345     char *lib_base=NULL, *bin_base=NULL;
 346     char *opal_prefix = getenv("OPAL_PREFIX");
 347     char* full_orted_cmd = NULL;
 348 
 349     
 350 
 351 
 352 
 353 
 354 
 355 
 356 
 357 
 358 
 359 
 360 
 361 
 362 
 363 
 364 
 365 
 366 
 367 
 368 
 369 
 370 
 371 
 372 
 373 
 374 
 375 
 376     
 377 
 378 
 379     argv = opal_argv_copy(rsh_agent_argv);
 380     argc = opal_argv_count(argv);
 381     
 382     if (NULL != mca_plm_rsh_component.ssh_args) {
 383         char **ssh_argv;
 384         ssh_argv = opal_argv_split(mca_plm_rsh_component.ssh_args, ' ');
 385         for (i=0; NULL != ssh_argv[i]; i++) {
 386             opal_argv_append(&argc, &argv, ssh_argv[i]);
 387         }
 388         opal_argv_free(ssh_argv);
 389     }
 390     *node_name_index1 = argc;
 391     opal_argv_append(&argc, &argv, "<template>");
 392 
 393     
 394     if (ORTE_SUCCESS != (rc = setup_shell(&remote_shell, &local_shell,
 395                                           nodename, &argc, &argv))) {
 396         ORTE_ERROR_LOG(rc);
 397         return rc;
 398     }
 399 
 400     
 401 
 402 
 403 
 404 
 405 
 406 
 407 
 408     orted_argc = 0;
 409     orted_argv = NULL;
 410     orted_index = orte_plm_base_setup_orted_cmd(&orted_argc, &orted_argv);
 411 
 412     
 413 
 414 
 415 
 416 
 417 
 418 
 419 
 420 
 421 
 422 
 423 
 424 
 425 
 426 
 427 
 428 
 429 
 430 
 431 
 432 
 433     if (0 == orted_index) {
 434         
 435 
 436 
 437 
 438 
 439         orted_cmd = opal_argv_join(orted_argv, ' ');
 440         orted_prefix = NULL;
 441     } else {
 442         
 443 
 444 
 445         orted_prefix = opal_argv_join_range(orted_argv, 0, orted_index, ' ');
 446         orted_cmd = opal_argv_join_range(orted_argv, orted_index, opal_argv_count(orted_argv), ' ');
 447     }
 448     opal_argv_free(orted_argv);  
 449 
 450     
 451     param = opal_basename(opal_install_dirs.libdir);
 452     if (NULL != mca_plm_rsh_component.pass_libpath) {
 453         if (NULL != prefix_dir) {
 454             opal_asprintf(&lib_base, "%s:%s/%s", mca_plm_rsh_component.pass_libpath, prefix_dir, param);
 455         } else {
 456             opal_asprintf(&lib_base, "%s:%s", mca_plm_rsh_component.pass_libpath, param);
 457         }
 458     } else if (NULL != prefix_dir) {
 459         opal_asprintf(&lib_base, "%s/%s", prefix_dir, param);
 460     }
 461     free(param);
 462 
 463     
 464 
 465 
 466     if (NULL != prefix_dir) {
 467         
 468 
 469 
 470 
 471 
 472         value = opal_basename(opal_install_dirs.bindir);
 473         opal_asprintf(&bin_base, "%s/%s", prefix_dir, value);
 474         free(value);
 475 
 476         if (NULL != orted_cmd) {
 477             if (0 == strcmp(orted_cmd, "orted")) {
 478                 
 479                 opal_asprintf(&full_orted_cmd, "%s/%s", bin_base, orted_cmd);
 480             } else {
 481                 
 482                 full_orted_cmd = strdup(orted_cmd);
 483             }
 484             free(orted_cmd);
 485         }
 486     } else {
 487         full_orted_cmd = orted_cmd;
 488     }
 489 
 490     if (NULL != lib_base || NULL != bin_base) {
 491         if (ORTE_PLM_RSH_SHELL_SH == remote_shell ||
 492             ORTE_PLM_RSH_SHELL_KSH == remote_shell ||
 493             ORTE_PLM_RSH_SHELL_ZSH == remote_shell ||
 494             ORTE_PLM_RSH_SHELL_BASH == remote_shell) {
 495             
 496 
 497 
 498 
 499             opal_asprintf (&final_cmd,
 500                             "%s%s%s PATH=%s%s$PATH ; export PATH ; "
 501                             "LD_LIBRARY_PATH=%s%s$LD_LIBRARY_PATH ; export LD_LIBRARY_PATH ; "
 502                             "DYLD_LIBRARY_PATH=%s%s$DYLD_LIBRARY_PATH ; export DYLD_LIBRARY_PATH ; "
 503                             "%s %s",
 504                             (opal_prefix != NULL ? "OPAL_PREFIX=" : " "),
 505                             (opal_prefix != NULL ? opal_prefix : " "),
 506                             (opal_prefix != NULL ? " ; export OPAL_PREFIX;" : " "),
 507                             (NULL != bin_base ? bin_base : " "),
 508                             (NULL != bin_base ? ":" : " "),
 509                             (NULL != lib_base ? lib_base : " "),
 510                             (NULL != lib_base ? ":" : " "),
 511                             (NULL != lib_base ? lib_base : " "),
 512                             (NULL != lib_base ? ":" : " "),
 513                             (orted_prefix != NULL ? orted_prefix : " "),
 514                             (full_orted_cmd != NULL ? full_orted_cmd : " "));
 515         } else if (ORTE_PLM_RSH_SHELL_TCSH == remote_shell ||
 516                    ORTE_PLM_RSH_SHELL_CSH == remote_shell) {
 517             
 518 
 519 
 520 
 521 
 522 
 523 
 524 
 525             
 526 
 527 
 528 
 529             opal_asprintf (&final_cmd,
 530                             "%s%s%s set path = ( %s $path ) ; "
 531                             "if ( $?LD_LIBRARY_PATH == 1 ) "
 532                             "set OMPI_have_llp ; "
 533                             "if ( $?LD_LIBRARY_PATH == 0 ) "
 534                             "setenv LD_LIBRARY_PATH %s ; "
 535                             "if ( $?OMPI_have_llp == 1 ) "
 536                             "setenv LD_LIBRARY_PATH %s%s$LD_LIBRARY_PATH ; "
 537                             "if ( $?DYLD_LIBRARY_PATH == 1 ) "
 538                             "set OMPI_have_dllp ; "
 539                             "if ( $?DYLD_LIBRARY_PATH == 0 ) "
 540                             "setenv DYLD_LIBRARY_PATH %s ; "
 541                             "if ( $?OMPI_have_dllp == 1 ) "
 542                             "setenv DYLD_LIBRARY_PATH %s%s$DYLD_LIBRARY_PATH ; "
 543                             "%s %s",
 544                             (opal_prefix != NULL ? "setenv OPAL_PREFIX " : " "),
 545                             (opal_prefix != NULL ? opal_prefix : " "),
 546                             (opal_prefix != NULL ? " ;" : " "),
 547                             (NULL != bin_base ? bin_base : " "),
 548                             (NULL != lib_base ? lib_base : " "),
 549                             (NULL != lib_base ? lib_base : " "),
 550                             (NULL != lib_base ? ":" : " "),
 551                             (NULL != lib_base ? lib_base : " "),
 552                             (NULL != lib_base ? lib_base : " "),
 553                             (NULL != lib_base ? ":" : " "),
 554                             (orted_prefix != NULL ? orted_prefix : " "),
 555                             (full_orted_cmd != NULL ? full_orted_cmd : " "));
 556         } else {
 557             orte_show_help("help-plm-rsh.txt", "cannot-resolve-shell-with-prefix", true,
 558                            (NULL == opal_prefix) ? "NULL" : opal_prefix,
 559                            prefix_dir);
 560             if (NULL != bin_base) {
 561                 free(bin_base);
 562             }
 563             if (NULL != lib_base) {
 564                 free(lib_base);
 565             }
 566             if (NULL != orted_prefix) free(orted_prefix);
 567             if (NULL != full_orted_cmd) free(full_orted_cmd);
 568             return ORTE_ERR_SILENT;
 569         }
 570         if (NULL != bin_base) {
 571             free(bin_base);
 572         }
 573         if (NULL != lib_base) {
 574             free(lib_base);
 575         }
 576         if( NULL != full_orted_cmd ) {
 577             free(full_orted_cmd);
 578         }
 579     } else {
 580         
 581         opal_asprintf(&final_cmd, "%s %s",
 582                        (orted_prefix != NULL ? orted_prefix : ""),
 583                        (full_orted_cmd != NULL ? full_orted_cmd : ""));
 584         if (NULL != full_orted_cmd) {
 585             free(full_orted_cmd);
 586         }
 587     }
 588     
 589     opal_argv_append(&argc, &argv, final_cmd);
 590     free(final_cmd);  
 591     if (NULL != orted_prefix) free(orted_prefix);
 592 
 593     
 594 
 595 
 596     if (mca_plm_rsh_component.no_tree_spawn &&
 597         !orte_debug_flag &&
 598         !orte_debug_daemons_flag &&
 599         !orte_debug_daemons_file_flag &&
 600         !orte_leave_session_attached &&
 601         
 602 
 603         ((!mca_plm_rsh_component.using_qrsh) ||
 604          (mca_plm_rsh_component.using_qrsh && mca_plm_rsh_component.daemonize_qrsh)) &&
 605         ((!mca_plm_rsh_component.using_llspawn) ||
 606          (mca_plm_rsh_component.using_llspawn && mca_plm_rsh_component.daemonize_llspawn))) {
 607     }
 608 
 609     
 610 
 611 
 612 
 613     orte_plm_base_orted_append_basic_args(&argc, &argv,
 614                                           "env",
 615                                           proc_vpid_index);
 616 
 617     
 618     opal_argv_append(&argc, &argv, "-"OPAL_MCA_CMD_LINE_ID);
 619     opal_argv_append(&argc, &argv, "plm");
 620     opal_argv_append(&argc, &argv, "rsh");
 621 
 622     
 623 
 624     if (!mca_plm_rsh_component.no_tree_spawn) {
 625         opal_argv_append(&argc, &argv, "--tree-spawn");
 626         orte_oob_base_get_addr(¶m);
 627         opal_argv_append(&argc, &argv, "-"OPAL_MCA_CMD_LINE_ID);
 628         opal_argv_append(&argc, &argv, "orte_parent_uri");
 629         opal_argv_append(&argc, &argv, param);
 630         free(param);
 631     }
 632 
 633     
 634     if (mca_plm_rsh_component.pass_environ_mca_params) {
 635         
 636 
 637 
 638         for (i = 0; NULL != environ[i]; ++i) {
 639             if (0 == strncmp(OPAL_MCA_PREFIX"mca_base_env_list", environ[i],
 640                              strlen(OPAL_MCA_PREFIX"mca_base_env_list"))) {
 641                 
 642                 continue;
 643             }
 644             if (0 == strncmp(OPAL_MCA_PREFIX, environ[i], 9)) {
 645                 
 646 
 647 
 648 
 649 
 650 
 651                 param = strdup(&environ[i][9]);
 652                 value = strchr(param, '=');
 653                 *value = '\0';
 654                 value++;
 655                 found = false;
 656                 
 657                 for (j=0; NULL != argv[j]; j++) {
 658                     if (0 == strcmp(param, argv[j])) {
 659                         found = true;
 660                         break;
 661                     }
 662                 }
 663                 if (!found) {
 664                     
 665                     opal_argv_append(&argc, &argv, "-"OPAL_MCA_CMD_LINE_ID);
 666                     opal_argv_append(&argc, &argv, param);
 667                     opal_argv_append(&argc, &argv, value);
 668                 }
 669                 free(param);
 670             }
 671         }
 672     }
 673 
 674     
 675     mca_base_cmd_line_wrap_args(argv);
 676 
 677     value = opal_argv_join(argv, ' ');
 678     if (sysconf(_SC_ARG_MAX) < (int)strlen(value)) {
 679         orte_show_help("help-plm-rsh.txt", "cmd-line-too-long",
 680                        true, strlen(value), sysconf(_SC_ARG_MAX));
 681         free(value);
 682         return ORTE_ERR_SILENT;
 683     }
 684     free(value);
 685 
 686     if (ORTE_PLM_RSH_SHELL_SH == remote_shell ||
 687         ORTE_PLM_RSH_SHELL_KSH == remote_shell) {
 688         opal_argv_append(&argc, &argv, ")");
 689     }
 690 
 691     if (0 < opal_output_get_verbosity(orte_plm_base_framework.framework_output)) {
 692         param = opal_argv_join(argv, ' ');
 693         opal_output(orte_plm_base_framework.framework_output,
 694                     "%s plm:rsh: final template argv:\n\t%s",
 695                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 696                     (NULL == param) ? "NULL" : param);
 697         if (NULL != param) free(param);
 698     }
 699 
 700     
 701     *argcptr = argc;
 702     *argvptr = argv;
 703     return ORTE_SUCCESS;
 704 }
 705 
 706 
 707 static void ssh_child(int argc, char **argv)
 708 {
 709     char** env;
 710     char* var;
 711     long fd, fdmax = sysconf(_SC_OPEN_MAX);
 712     char *exec_path;
 713     char **exec_argv;
 714     int fdin;
 715     sigset_t sigs;
 716 
 717     
 718     env = opal_argv_copy(orte_launch_environ);
 719 
 720     
 721 
 722 
 723 
 724 
 725 
 726     
 727 
 728 
 729 
 730 
 731     exec_argv = argv;
 732     exec_path = strdup(rsh_agent_path);
 733 
 734     
 735     fdin = open("/dev/null", O_RDWR);
 736     dup2(fdin, 0);
 737     close(fdin);
 738 
 739     
 740     for(fd=3; fd<fdmax; fd++)
 741         close(fd);
 742 
 743     
 744 
 745 
 746 
 747 
 748 
 749 
 750     set_handler_default(SIGTERM);
 751     set_handler_default(SIGINT);
 752     set_handler_default(SIGHUP);
 753     set_handler_default(SIGPIPE);
 754     set_handler_default(SIGCHLD);
 755 
 756     
 757 
 758 
 759 
 760 
 761 
 762 
 763     sigprocmask(0, 0, &sigs);
 764     sigprocmask(SIG_UNBLOCK, &sigs, 0);
 765 
 766     
 767     var = opal_argv_join(argv, ' ');
 768     OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
 769                          "%s plm:rsh: executing: (%s) [%s]",
 770                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 771                          exec_path, (NULL == var) ? "NULL" : var));
 772     if (NULL != var) free(var);
 773 
 774     execve(exec_path, exec_argv, env);
 775     opal_output(0, "plm:rsh: execv of %s failed with errno=%s(%d)\n",
 776                 exec_path, strerror(errno), errno);
 777     exit(-1);
 778 }
 779 
 780 
 781 
 782 
 783 static int remote_spawn(void)
 784 {
 785     int node_name_index1;
 786     int proc_vpid_index;
 787     char **argv = NULL;
 788     char *prefix, *hostname, *var;
 789     int argc;
 790     int rc=ORTE_SUCCESS;
 791     bool failed_launch = true;
 792     orte_process_name_t target;
 793     orte_plm_rsh_caddy_t *caddy;
 794     orte_job_t *daemons;
 795     opal_list_t coll;
 796     orte_namelist_t *child;
 797 
 798     OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
 799                          "%s plm:rsh: remote spawn called",
 800                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
 801 
 802     
 803     target.vpid = ORTE_PROC_MY_NAME->vpid;
 804 
 805     
 806 
 807 
 808     if ((bool)ORTE_WANT_ORTERUN_PREFIX_BY_DEFAULT) {
 809         prefix = strdup(opal_install_dirs.prefix);
 810     } else {
 811         prefix = NULL;
 812     }
 813 
 814     
 815     OBJ_CONSTRUCT(&coll, opal_list_t);
 816     orte_routed.get_routing_list(&coll);
 817 
 818     
 819     if (0 == opal_list_get_size(&coll)) {
 820         OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
 821                              "%s plm:rsh: remote spawn - have no children!",
 822                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
 823         failed_launch = false;
 824         rc = ORTE_SUCCESS;
 825         OBJ_DESTRUCT(&coll);
 826         goto cleanup;
 827     }
 828 
 829     
 830     if (ORTE_SUCCESS != (rc = setup_launch(&argc, &argv,
 831                                            orte_process_info.nodename, &node_name_index1,
 832                                            &proc_vpid_index, prefix))) {
 833         ORTE_ERROR_LOG(rc);
 834         OBJ_DESTRUCT(&coll);
 835         goto cleanup;
 836     }
 837 
 838     
 839     if (NULL == (daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) {
 840         ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
 841         rc = ORTE_ERR_NOT_FOUND;
 842         OBJ_DESTRUCT(&coll);
 843         goto cleanup;
 844     }
 845 
 846     target.jobid = ORTE_PROC_MY_NAME->jobid;
 847     OPAL_LIST_FOREACH(child, &coll, orte_namelist_t) {
 848         target.vpid = child->name.vpid;
 849 
 850         
 851         if (NULL == (hostname = orte_get_proc_hostname(&target))) {
 852             opal_output(0, "%s unable to get hostname for daemon %s",
 853                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_VPID_PRINT(child->name.vpid));
 854             rc = ORTE_ERR_NOT_FOUND;
 855             OBJ_DESTRUCT(&coll);
 856             goto cleanup;
 857         }
 858 
 859         free(argv[node_name_index1]);
 860         argv[node_name_index1] = strdup(hostname);
 861 
 862         
 863         rc = orte_util_convert_vpid_to_string(&var, target.vpid);
 864         if (ORTE_SUCCESS != rc) {
 865             opal_output(0, "orte_plm_rsh: unable to get daemon vpid as string");
 866             exit(-1);
 867         }
 868         free(argv[proc_vpid_index]);
 869         argv[proc_vpid_index] = strdup(var);
 870         free(var);
 871 
 872         
 873         caddy = OBJ_NEW(orte_plm_rsh_caddy_t);
 874         caddy->argc = argc;
 875         caddy->argv = opal_argv_copy(argv);
 876         
 877 
 878 
 879         caddy->daemon = OBJ_NEW(orte_proc_t);
 880         caddy->daemon->name.jobid = ORTE_PROC_MY_NAME->jobid;
 881         caddy->daemon->name.vpid = target.vpid;
 882         opal_list_append(&launch_list, &caddy->super);
 883     }
 884     OPAL_LIST_DESTRUCT(&coll);
 885     
 886 
 887 
 888     mca_plm_rsh_component.no_tree_spawn = true;
 889 
 890     
 891     OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
 892                          "%s plm:rsh: activating launch event",
 893                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
 894     opal_event_active(&launch_event, EV_WRITE, 1);
 895 
 896     
 897     failed_launch = false;
 898 
 899 cleanup:
 900     if (NULL != argv) {
 901         opal_argv_free(argv);
 902     }
 903 
 904     
 905     if (failed_launch) {
 906         
 907         opal_buffer_t *buf;
 908         buf = OBJ_NEW(opal_buffer_t);
 909         opal_dss.pack(buf, &target.vpid, 1, ORTE_VPID);
 910         opal_dss.pack(buf, &rc, 1, OPAL_INT);
 911         orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, buf,
 912                                 ORTE_RML_TAG_REPORT_REMOTE_LAUNCH,
 913                                 orte_rml_send_callback, NULL);
 914     }
 915 
 916     return rc;
 917 }
 918 
 919 
 920 
 921 
 922 
 923 
 924 static int rsh_launch(orte_job_t *jdata)
 925 {
 926     if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_RESTART)) {
 927         
 928         ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_MAP);
 929     } else {
 930         
 931         ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_INIT);
 932     }
 933     return ORTE_SUCCESS;
 934 }
 935 
 936 static void process_launch_list(int fd, short args, void *cbdata)
 937 {
 938     opal_list_item_t *item;
 939     pid_t pid;
 940     orte_plm_rsh_caddy_t *caddy;
 941 
 942     ORTE_ACQUIRE_OBJECT(caddy);
 943 
 944     while (num_in_progress < mca_plm_rsh_component.num_concurrent) {
 945         item = opal_list_remove_first(&launch_list);
 946         if (NULL == item) {
 947             
 948             break;
 949         }
 950         caddy = (orte_plm_rsh_caddy_t*)item;
 951         
 952         ORTE_FLAG_SET(caddy->daemon, ORTE_PROC_FLAG_ALIVE);
 953         orte_wait_cb(caddy->daemon, rsh_wait_daemon, orte_event_base, (void*)caddy);
 954 
 955         
 956         pid = fork();
 957         if (pid < 0) {
 958             ORTE_ERROR_LOG(ORTE_ERR_SYS_LIMITS_CHILDREN);
 959             orte_wait_cb_cancel(caddy->daemon);
 960             continue;
 961         }
 962 
 963         
 964         if (pid == 0) {
 965             
 966 
 967 
 968 
 969 
 970 
 971 
 972 
 973 
 974 
 975 
 976 
 977 
 978 
 979 
 980 
 981 
 982 #if HAVE_SETPGID
 983             if( 0 != setpgid(0, 0) ) {
 984                 opal_output(0, "plm:rsh: Error: setpgid(0,0) failed in child with errno=%s(%d)\n",
 985                             strerror(errno), errno);
 986                 exit(-1);
 987             }
 988 #endif
 989 
 990             
 991             ssh_child(caddy->argc, caddy->argv);
 992         } else { 
 993             
 994             
 995 #if HAVE_SETPGID
 996             if( 0 != setpgid(pid, pid) ) {
 997                 opal_output(0, "plm:rsh: Warning: setpgid(%ld,%ld) failed in parent with errno=%s(%d)\n",
 998                             (long)pid, (long)pid, strerror(errno), errno);
 999                 
1000                 
1001             }
1002 #endif
1003 
1004             
1005             caddy->daemon->state = ORTE_PROC_STATE_RUNNING;
1006             
1007             caddy->daemon->pid = pid;
1008 
1009             OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
1010                                  "%s plm:rsh: recording launch of daemon %s",
1011                                  ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1012                                  ORTE_NAME_PRINT(&(caddy->daemon->name))));
1013             num_in_progress++;
1014         }
1015     }
1016 }
1017 
1018 static void launch_daemons(int fd, short args, void *cbdata)
1019 {
1020     orte_job_map_t *map = NULL;
1021     int node_name_index1;
1022     int proc_vpid_index;
1023     char **argv = NULL;
1024     char *prefix_dir=NULL, *var;
1025     int argc;
1026     int rc;
1027     orte_app_context_t *app;
1028     orte_node_t *node, *nd;
1029     orte_std_cntr_t nnode;
1030     orte_job_t *daemons;
1031     orte_state_caddy_t *state = (orte_state_caddy_t*)cbdata;
1032     orte_plm_rsh_caddy_t *caddy;
1033     opal_list_t coll;
1034     char *username;
1035     int port, *portptr;
1036     orte_namelist_t *child;
1037 
1038     ORTE_ACQUIRE_OBJECT(state);
1039 
1040     
1041 
1042 
1043     if (ORTE_FLAG_TEST(state->jdata, ORTE_JOB_FLAG_DEBUGGER_DAEMON)) {
1044         state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
1045         ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_DAEMONS_REPORTED);
1046         OBJ_RELEASE(state);
1047         return;
1048     }
1049 
1050     
1051     daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
1052     if (ORTE_SUCCESS != (rc = orte_plm_base_setup_virtual_machine(state->jdata))) {
1053         ORTE_ERROR_LOG(rc);
1054         goto cleanup;
1055     }
1056 
1057     
1058 
1059 
1060 
1061     if (orte_do_not_launch) {
1062         
1063 
1064 
1065 
1066         state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
1067         ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_DAEMONS_REPORTED);
1068         OBJ_RELEASE(state);
1069         return;
1070     }
1071 
1072     
1073     if (NULL == (map = daemons->map)) {
1074         ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
1075         rc = ORTE_ERR_NOT_FOUND;
1076         goto cleanup;
1077     }
1078 
1079     if (0 == map->num_new_daemons) {
1080         
1081 
1082 
1083 
1084         state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
1085         ORTE_ACTIVATE_JOB_STATE(state->jdata, ORTE_JOB_STATE_DAEMONS_REPORTED);
1086         OBJ_RELEASE(state);
1087         return;
1088     }
1089 
1090     OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
1091                          "%s plm:rsh: launching vm",
1092                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
1093 
1094     if ((0 < opal_output_get_verbosity(orte_plm_base_framework.framework_output) ||
1095          orte_leave_session_attached) &&
1096         mca_plm_rsh_component.num_concurrent < map->num_new_daemons) {
1097         
1098 
1099 
1100 
1101 
1102 
1103 
1104 
1105 
1106 
1107 
1108 
1109 
1110         orte_show_help("help-plm-rsh.txt", "deadlock-params",
1111                        true, mca_plm_rsh_component.num_concurrent, map->num_new_daemons);
1112         ORTE_ERROR_LOG(ORTE_ERR_FATAL);
1113         OBJ_RELEASE(state);
1114         rc = ORTE_ERR_SILENT;
1115         goto cleanup;
1116     }
1117 
1118     
1119 
1120 
1121 
1122 
1123 
1124 
1125 
1126 
1127 
1128 
1129 
1130 
1131 
1132 
1133 
1134 
1135     app = (orte_app_context_t*)opal_pointer_array_get_item(state->jdata->apps, 0);
1136     if (!orte_get_attribute(&app->attributes, ORTE_APP_PREFIX_DIR, (void**)&prefix_dir, OPAL_STRING)) {
1137         
1138 
1139 
1140         if ((bool)ORTE_WANT_ORTERUN_PREFIX_BY_DEFAULT) {
1141             prefix_dir = strdup(opal_install_dirs.prefix);
1142         }
1143     }
1144     
1145 
1146 
1147     node = NULL;
1148     for (nnode = 0; nnode < map->nodes->size; nnode++) {
1149         if (NULL != (nd = (orte_node_t*)opal_pointer_array_get_item(map->nodes, nnode))) {
1150             node = nd;
1151             
1152 
1153 
1154 
1155             if (0 != strcmp(node->name, orte_process_info.nodename)) {
1156                 break;
1157             }
1158         }
1159     }
1160     if (NULL == node) {
1161         
1162 
1163         ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
1164         rc = ORTE_ERR_NOT_FOUND;
1165         goto cleanup;
1166     }
1167 
1168     
1169     if (!mca_plm_rsh_component.no_tree_spawn) {
1170         orte_job_t *jdatorted;
1171 
1172         
1173         if (NULL == (jdatorted = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) {
1174             ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
1175             rc = ORTE_ERR_NOT_FOUND;
1176             goto cleanup;
1177         }
1178 
1179         
1180         OBJ_CONSTRUCT(&coll, opal_list_t);
1181         orte_routed.get_routing_list(&coll);
1182     }
1183 
1184     
1185     if (ORTE_SUCCESS != (rc = setup_launch(&argc, &argv, node->name, &node_name_index1,
1186                                            &proc_vpid_index, prefix_dir))) {
1187         ORTE_ERROR_LOG(rc);
1188         goto cleanup;
1189     }
1190 
1191     
1192 
1193 
1194     for (nnode=0; nnode < map->nodes->size; nnode++) {
1195         if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, nnode))) {
1196             continue;
1197         }
1198 
1199         
1200         if (!mca_plm_rsh_component.no_tree_spawn) {
1201             OPAL_LIST_FOREACH(child, &coll, orte_namelist_t) {
1202                 if (child->name.vpid == node->daemon->name.vpid) {
1203                     goto launch;
1204                 }
1205             }
1206             
1207             OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
1208                                  "%s plm:rsh:launch daemon %s not a child of mine",
1209                                  ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1210                                  ORTE_VPID_PRINT(node->daemon->name.vpid)));
1211             continue;
1212         }
1213 
1214     launch:
1215         
1216         if (ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_DAEMON_LAUNCHED)) {
1217             OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
1218                                  "%s plm:rsh:launch daemon already exists on node %s",
1219                                  ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1220                                  node->name));
1221             continue;
1222         }
1223 
1224         
1225 
1226 
1227         if (NULL == node->daemon) {
1228             ORTE_ERROR_LOG(ORTE_ERR_FATAL);
1229             OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
1230                                  "%s plm:rsh:launch daemon failed to be defined on node %s",
1231                                  ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1232                                  node->name));
1233             continue;
1234         }
1235 
1236         
1237         free(argv[node_name_index1]);
1238         username = NULL;
1239         if (orte_get_attribute(&node->attributes, ORTE_NODE_USERNAME, (void**)&username, OPAL_STRING)) {
1240             opal_asprintf (&argv[node_name_index1], "%s@%s",
1241                             username, node->name);
1242             free(username);
1243         } else {
1244             argv[node_name_index1] = strdup(node->name);
1245         }
1246 
1247         
1248         rc = orte_util_convert_vpid_to_string(&var, node->daemon->name.vpid);
1249         if (ORTE_SUCCESS != rc) {
1250             opal_output(0, "orte_plm_rsh: unable to get daemon vpid as string");
1251             exit(-1);
1252         }
1253         free(argv[proc_vpid_index]);
1254         argv[proc_vpid_index] = strdup(var);
1255         free(var);
1256 
1257         OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
1258                              "%s plm:rsh: adding node %s to launch list",
1259                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1260                              node->name));
1261 
1262         
1263         caddy = OBJ_NEW(orte_plm_rsh_caddy_t);
1264         caddy->argc = argc;
1265         caddy->argv = opal_argv_copy(argv);
1266         
1267         portptr = &port;
1268         if (orte_get_attribute(&node->attributes, ORTE_NODE_PORT, (void**)&portptr, OPAL_INT)) {
1269             char portname[16];
1270             
1271             opal_argv_insert_element(&caddy->argv, node_name_index1+1, "-p");
1272             snprintf (portname, 15, "%d", port);
1273             opal_argv_insert_element(&caddy->argv, node_name_index1+2, portname);
1274         }
1275         caddy->daemon = node->daemon;
1276         OBJ_RETAIN(caddy->daemon);
1277         opal_list_append(&launch_list, &caddy->super);
1278     }
1279     
1280 
1281 
1282     mca_plm_rsh_component.no_tree_spawn = true;
1283 
1284     
1285     state->jdata->state = ORTE_JOB_STATE_DAEMONS_LAUNCHED;
1286 
1287     
1288     OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
1289                          "%s plm:rsh: activating launch event",
1290                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
1291     ORTE_POST_OBJECT(state);
1292     opal_event_active(&launch_event, EV_WRITE, 1);
1293 
1294     
1295 
1296 
1297     OBJ_RELEASE(state);
1298     opal_argv_free(argv);
1299     return;
1300 
1301  cleanup:
1302     OBJ_RELEASE(state);
1303     ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
1304 }
1305 
1306 
1307 
1308 
1309 static int rsh_terminate_orteds(void)
1310 {
1311     int rc;
1312 
1313     if (ORTE_SUCCESS != (rc = orte_plm_base_orted_exit(ORTE_DAEMON_EXIT_CMD))) {
1314         ORTE_ERROR_LOG(rc);
1315     }
1316 
1317     return rc;
1318 }
1319 
1320 static int rsh_finalize(void)
1321 {
1322     int rc, i;
1323     orte_job_t *jdata;
1324     orte_proc_t *proc;
1325     pid_t ret;
1326 
1327     
1328     opal_event_del(&launch_event);
1329     OPAL_LIST_DESTRUCT(&launch_list);
1330 
1331     
1332     if (ORTE_SUCCESS != (rc = orte_plm_base_comm_stop())) {
1333         ORTE_ERROR_LOG(rc);
1334     }
1335 
1336     if ((ORTE_PROC_IS_DAEMON || ORTE_PROC_IS_HNP) && orte_abnormal_term_ordered) {
1337         
1338         if (NULL == (jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) {
1339             return rc;
1340         }
1341         for (i=0; i < jdata->procs->size; i++) {
1342             if (NULL == (proc = opal_pointer_array_get_item(jdata->procs, i))) {
1343                 continue;
1344             }
1345             if (0 < proc->pid) {
1346                 
1347                 ret = waitpid(proc->pid, &proc->exit_code, WNOHANG);
1348                 if (-1 == ret && ECHILD == errno) {
1349                     
1350 
1351                     continue;
1352                 }
1353                 if (ret == proc->pid) {
1354                     
1355                     continue;
1356                 }
1357                 
1358                 kill(proc->pid, SIGKILL);
1359             }
1360         }
1361     }
1362     free(mca_plm_rsh_component.agent_path);
1363     free(rsh_agent_path);
1364     opal_argv_free(mca_plm_rsh_component.agent_argv);
1365     opal_argv_free(rsh_agent_argv);
1366 
1367     return rc;
1368 }
1369 
1370 
/*
 * Restore the default disposition (SIG_DFL) for signal `sig`, with an
 * empty signal mask and no flags.  The previous disposition is not
 * retrieved and the result of sigaction() is not checked.
 */
static void set_handler_default(int sig)
{
    struct sigaction dflt;

    memset(&dflt, 0, sizeof(dflt));
    sigemptyset(&dflt.sa_mask);
    dflt.sa_flags = 0;
    dflt.sa_handler = SIG_DFL;

    sigaction(sig, &dflt, NULL);
}
1381 
1382 
1383 static orte_plm_rsh_shell_t find_shell(char *shell)
1384 {
1385     int i         = 0;
1386     char *sh_name = NULL;
1387 
1388     if( (NULL == shell) || (strlen(shell) == 1) ) {
1389         
1390         return ORTE_PLM_RSH_SHELL_UNKNOWN;
1391     }
1392 
1393     sh_name = rindex(shell, '/');
1394     if( NULL == sh_name ) {
1395         
1396         return ORTE_PLM_RSH_SHELL_UNKNOWN;
1397     }
1398 
1399     
1400     ++sh_name;
1401     for (i = 0; i < (int)(sizeof (orte_plm_rsh_shell_name) /
1402                           sizeof(orte_plm_rsh_shell_name[0])); ++i) {
1403         if (NULL != strstr(sh_name, orte_plm_rsh_shell_name[i])) {
1404             return (orte_plm_rsh_shell_t)i;
1405         }
1406     }
1407 
1408     
1409     return ORTE_PLM_RSH_SHELL_UNKNOWN;
1410 }
1411 
1412 static int launch_agent_setup(const char *agent, char *path)
1413 {
1414     char *bname;
1415     int i;
1416 
1417     
1418     if (NULL == mca_plm_rsh_component.agent && NULL == agent) {
1419         return ORTE_ERR_NOT_FOUND;
1420     }
1421 
1422     
1423     OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
1424                          "%s plm:rsh_setup on agent %s path %s",
1425                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1426                          (NULL == agent) ? mca_plm_rsh_component.agent : agent,
1427                          (NULL == path) ? "NULL" : path));
1428     rsh_agent_argv = orte_plm_rsh_search(agent, path);
1429 
1430     if (0 == opal_argv_count(rsh_agent_argv)) {
1431         
1432         return ORTE_ERR_NOT_FOUND;
1433     }
1434 
1435     
1436     rsh_agent_path = opal_path_findv(rsh_agent_argv[0], X_OK, environ, path);
1437 
1438     if (NULL == rsh_agent_path) {
1439         
1440         opal_argv_free(rsh_agent_argv);
1441         return ORTE_ERR_NOT_FOUND;
1442     }
1443 
1444     bname = opal_basename(rsh_agent_argv[0]);
1445     if (NULL != bname && 0 == strcmp(bname, "ssh")) {
1446         
1447         if (NULL != orte_xterm) {
1448             opal_argv_append_unique_nosize(&rsh_agent_argv, "-X", false);
1449         } else if (0 >= opal_output_get_verbosity(orte_plm_base_framework.framework_output)) {
1450             
1451 
1452 
1453 
1454             for (i = 1; NULL != rsh_agent_argv[i]; ++i) {
1455                 if (0 == strcasecmp("-x", rsh_agent_argv[i])) {
1456                     break;
1457                 }
1458             }
1459             if (NULL == rsh_agent_argv[i]) {
1460                 opal_argv_append_nosize(&rsh_agent_argv, "-x");
1461             }
1462         }
1463     }
1464     if (NULL != bname) {
1465         free(bname);
1466     }
1467 
1468     
1469     return ORTE_SUCCESS;
1470 }
1471 
1472 
1473 
1474 
1475 static int rsh_probe(char *nodename,
1476                      orte_plm_rsh_shell_t *shell)
1477 {
1478     char ** argv;
1479     int argc, rc = ORTE_SUCCESS, i;
1480     int fd[2];
1481     pid_t pid;
1482     char outbuf[4096];
1483 
1484     OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
1485                          "%s plm:rsh: going to check SHELL variable on node %s",
1486                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1487                          nodename));
1488 
1489     *shell = ORTE_PLM_RSH_SHELL_UNKNOWN;
1490     if (pipe(fd)) {
1491         OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
1492                              "%s plm:rsh: pipe failed with errno=%d",
1493                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1494                              errno));
1495         return ORTE_ERR_IN_ERRNO;
1496     }
1497     if ((pid = fork()) < 0) {
1498         OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
1499                              "%s plm:rsh: fork failed with errno=%d",
1500                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1501                              errno));
1502         return ORTE_ERR_IN_ERRNO;
1503     }
1504     else if (pid == 0) {          
1505         if (dup2(fd[1], 1) < 0) {
1506             OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
1507                                  "%s plm:rsh: dup2 failed with errno=%d",
1508                                  ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1509                                  errno));
1510             exit(01);
1511         }
1512         
1513         argv = opal_argv_copy(mca_plm_rsh_component.agent_argv);
1514         argc = opal_argv_count(mca_plm_rsh_component.agent_argv);
1515         opal_argv_append(&argc, &argv, nodename);
1516         opal_argv_append(&argc, &argv, "echo $SHELL");
1517 
1518         execvp(argv[0], argv);
1519         exit(errno);
1520     }
1521     if (close(fd[1])) {
1522         OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
1523                              "%s plm:rsh: close failed with errno=%d",
1524                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1525                              errno));
1526         return ORTE_ERR_IN_ERRNO;
1527     }
1528 
1529     {
1530         ssize_t ret = 1;
1531         char* ptr = outbuf;
1532         size_t outbufsize = sizeof(outbuf);
1533 
1534         do {
1535             ret = read (fd[0], ptr, outbufsize-1);
1536             if (ret < 0) {
1537                 if (errno == EINTR)
1538                     continue;
1539                 OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
1540                                      "%s plm:rsh: Unable to detect the remote shell (error %s)",
1541                                      ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1542                                      strerror(errno)));
1543                 rc = ORTE_ERR_IN_ERRNO;
1544                 break;
1545             }
1546             if( outbufsize > 1 ) {
1547                 outbufsize -= ret;
1548                 ptr += ret;
1549             }
1550         } while( 0 != ret );
1551         *ptr = '\0';
1552     }
1553     close(fd[0]);
1554 
1555     if( outbuf[0] != '\0' ) {
1556         char *sh_name = rindex(outbuf, '/');
1557         if( NULL != sh_name ) {
1558             sh_name++; 
1559             
1560             for (i = 0; i < (int)(sizeof (orte_plm_rsh_shell_name)/
1561                                   sizeof(orte_plm_rsh_shell_name[0])); i++) {
1562                 if ( NULL != strstr(sh_name, orte_plm_rsh_shell_name[i]) ) {
1563                     *shell = (orte_plm_rsh_shell_t)i;
1564                     break;
1565                 }
1566             }
1567         }
1568     }
1569 
1570     OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
1571                          "%s plm:rsh: node %s has SHELL: %s",
1572                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1573                          nodename,
1574                          (ORTE_PLM_RSH_SHELL_UNKNOWN == *shell) ? "UNHANDLED" : (char*)orte_plm_rsh_shell_name[*shell]));
1575 
1576     return rc;
1577 }
1578 
1579 static int setup_shell(orte_plm_rsh_shell_t *rshell,
1580                        orte_plm_rsh_shell_t *lshell,
1581                        char *nodename, int *argc, char ***argv)
1582 {
1583     orte_plm_rsh_shell_t remote_shell, local_shell;
1584     char *param;
1585     int rc;
1586 
1587     
1588     local_shell = ORTE_PLM_RSH_SHELL_UNKNOWN;
1589 
1590 #if OPAL_ENABLE_GETPWUID
1591     {
1592         struct passwd *p;
1593 
1594         p = getpwuid(getuid());
1595         if( NULL != p ) {
1596             param = p->pw_shell;
1597             local_shell = find_shell(p->pw_shell);
1598         }
1599     }
1600 #endif
1601 
1602     
1603 
1604 
1605     if (ORTE_PLM_RSH_SHELL_UNKNOWN == local_shell &&
1606         NULL != (param = getenv("SHELL"))) {
1607         local_shell = find_shell(param);
1608     }
1609 
1610     if (ORTE_PLM_RSH_SHELL_UNKNOWN == local_shell) {
1611         opal_output(0, "WARNING: local probe returned unhandled shell:%s assuming bash\n",
1612                     (NULL != param) ? param : "unknown");
1613         local_shell = ORTE_PLM_RSH_SHELL_BASH;
1614     }
1615 
1616     OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
1617                          "%s plm:rsh: local shell: %d (%s)",
1618                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1619                          local_shell, orte_plm_rsh_shell_name[local_shell]));
1620 
1621     
1622     if (mca_plm_rsh_component.assume_same_shell) {
1623         remote_shell = local_shell;
1624         OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
1625                              "%s plm:rsh: assuming same remote shell as local shell",
1626                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
1627     } else {
1628         rc = rsh_probe(nodename, &remote_shell);
1629 
1630         if (ORTE_SUCCESS != rc) {
1631             ORTE_ERROR_LOG(rc);
1632             return rc;
1633         }
1634 
1635         if (ORTE_PLM_RSH_SHELL_UNKNOWN == remote_shell) {
1636             opal_output(0, "WARNING: rsh probe returned unhandled shell; assuming bash\n");
1637             remote_shell = ORTE_PLM_RSH_SHELL_BASH;
1638         }
1639     }
1640 
1641     OPAL_OUTPUT_VERBOSE((1, orte_plm_base_framework.framework_output,
1642                          "%s plm:rsh: remote shell: %d (%s)",
1643                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1644                          remote_shell, orte_plm_rsh_shell_name[remote_shell]));
1645 
1646     
1647 
1648 
1649 
1650 
1651 
1652 
1653 
1654     if (ORTE_PLM_RSH_SHELL_SH == remote_shell ||
1655         ORTE_PLM_RSH_SHELL_KSH == remote_shell) {
1656         int i;
1657         char **tmp;
1658         tmp = opal_argv_split("( test ! -r ./.profile || . ./.profile;", ' ');
1659         if (NULL == tmp) {
1660             return ORTE_ERR_OUT_OF_RESOURCE;
1661         }
1662         for (i = 0; NULL != tmp[i]; ++i) {
1663             opal_argv_append(argc, argv, tmp[i]);
1664         }
1665         opal_argv_free(tmp);
1666     }
1667 
1668     
1669     *rshell = remote_shell;
1670     *lshell = local_shell;
1671 
1672     return ORTE_SUCCESS;
1673 }