This source file includes the following definitions.
- setup_cbfunc
- orte_odls_base_default_get_add_procs_data
- ls_cbunc
- orte_odls_base_default_construct_child_list
- setup_path
- timer_cb
- compute_num_procs_alive
- orte_odls_base_spawn_proc
- orte_odls_base_default_launch_local
- orte_odls_base_default_signal_local_procs
- orte_odls_base_default_wait_local_proc
- qcdcon
- qcddes
- orte_odls_base_default_kill_local_procs
- orte_odls_base_get_proc_stats
- orte_odls_base_default_restart_proc
   1 
   2 
   3 
   4 
   5 
   6 
   7 
   8 
   9 
  10 
  11 
  12 
  13 
  14 
  15 
  16 
  17 
  18 
  19 
  20 
  21 
  22 
  23 
  24 
  25 
  26 
  27 
  28 
  29 
  30 #include "orte_config.h"
  31 #include "orte/constants.h"
  32 #include "orte/types.h"
  33 
  34 #ifdef HAVE_SYS_WAIT_H
  35 #include <sys/wait.h>
  36 #endif
  37 #include <errno.h>
  38 #ifdef HAVE_SYS_STAT_H
  39 #include <sys/stat.h>
  40 #endif  
  41 #ifdef HAVE_SYS_PARAM_H
  42 #include <sys/param.h>
  43 #endif
  44 #include <time.h>
  45 
  46 #include <signal.h>
  47 
  48 #include "opal_stdint.h"
  49 #include "opal/util/opal_environ.h"
  50 #include "opal/util/argv.h"
  51 #include "opal/util/os_dirpath.h"
  52 #include "opal/util/os_path.h"
  53 #include "opal/util/path.h"
  54 #include "opal/util/printf.h"
  55 #include "opal/util/sys_limits.h"
  56 #include "opal/dss/dss.h"
  57 #include "opal/mca/hwloc/hwloc-internal.h"
  58 #include "opal/mca/shmem/base/base.h"
  59 #include "opal/mca/pstat/pstat.h"
  60 #include "opal/mca/pmix/base/base.h"
  61 
  62 #include "orte/mca/errmgr/errmgr.h"
  63 #include "orte/mca/rml/rml.h"
  64 #include "orte/mca/routed/routed.h"
  65 #include "orte/mca/iof/iof.h"
  66 #include "orte/mca/iof/base/iof_base_setup.h"
  67 #include "orte/mca/ess/base/base.h"
  68 #include "orte/mca/grpcomm/base/base.h"
  69 #include "orte/mca/plm/base/base.h"
  70 #include "orte/mca/rml/base/rml_contact.h"
  71 #include "orte/mca/rmaps/rmaps_types.h"
  72 #include "orte/mca/rmaps/base/base.h"
  73 #include "orte/mca/rmaps/base/rmaps_private.h"
  74 #include "orte/mca/rtc/rtc.h"
  75 #include "orte/mca/schizo/schizo.h"
  76 #include "orte/mca/state/state.h"
  77 #include "orte/mca/filem/filem.h"
  78 
  79 #include "orte/util/context_fns.h"
  80 #include "orte/util/name_fns.h"
  81 #include "orte/util/nidmap.h"
  82 #include "orte/util/session_dir.h"
  83 #include "orte/util/proc_info.h"
  84 #include "orte/util/show_help.h"
  85 #include "orte/util/threads.h"
  86 #include "orte/runtime/orte_globals.h"
  87 #include "orte/runtime/orte_wait.h"
  88 #include "orte/orted/orted.h"
  89 #include "orte/orted/pmix/pmix_server.h"
  90 
  91 #if OPAL_ENABLE_FT_CR == 1
  92 #include "orte/mca/snapc/snapc.h"
  93 #include "orte/mca/snapc/base/base.h"
  94 #include "orte/mca/sstore/sstore.h"
  95 #include "orte/mca/sstore/base/base.h"
  96 #include "opal/mca/crs/crs.h"
  97 #include "opal/mca/crs/base/base.h"
  98 #endif
  99 
 100 #include "orte/mca/odls/base/base.h"
 101 #include "orte/mca/odls/base/odls_private.h"
 102 
 103 static void setup_cbfunc(int status,
 104                          opal_list_t *info,
 105                          void *provided_cbdata,
 106                          opal_pmix_op_cbfunc_t cbfunc, void *cbdata)
 107 {
 108     orte_job_t *jdata = (orte_job_t*)provided_cbdata;
 109     opal_value_t *kv;
 110     opal_buffer_t cache, *bptr;
 111     int rc = ORTE_SUCCESS;
 112 
 113     OBJ_CONSTRUCT(&cache, opal_buffer_t);
 114     if (NULL != info) {
 115         
 116         OPAL_LIST_FOREACH(kv, info, opal_value_t) {
 117             if (OPAL_SUCCESS != (rc = opal_dss.pack(&cache, &kv, 1, OPAL_VALUE))) {
 118                 ORTE_ERROR_LOG(rc);
 119             }
 120         }
 121     }
 122     
 123     bptr = &cache;
 124     opal_dss.pack(&jdata->launch_msg, &bptr, 1, OPAL_BUFFER);
 125     OBJ_DESTRUCT(&cache);
 126 
 127     
 128     if (NULL != cbfunc) {
 129         cbfunc(rc, cbdata);
 130     }
 131 
 132     
 133     ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_SEND_LAUNCH_MSG);
 134 
 135 }
 136 
 137 
 138 
 139 int orte_odls_base_default_get_add_procs_data(opal_buffer_t *buffer,
 140                                               orte_jobid_t job)
 141 {
 142     int rc, v;
 143     orte_job_t *jdata=NULL, *jptr;
 144     orte_job_map_t *map=NULL;
 145     opal_buffer_t *wireup, jobdata, priorjob;
 146     opal_byte_object_t bo, *boptr;
 147     int32_t numbytes;
 148     int8_t flag;
 149     void *nptr;
 150     uint32_t key;
 151     orte_proc_t *dmn, *proc;
 152     opal_value_t *val = NULL, *kv;
 153     opal_list_t *modex, ilist;
 154     int n;
 155 
 156     
 157     if (NULL == (jdata = orte_get_job_data_object(job))) {
 158         ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
 159         return ORTE_ERR_BAD_PARAM;
 160     }
 161 
 162     
 163     map = jdata->map;
 164     
 165     if (NULL == map) {
 166         return ORTE_SUCCESS;
 167     }
 168 
 169     
 170 
 171 
 172     if (1 < orte_process_info.num_procs &&
 173         (!orte_node_info_communicated ||
 174          orte_get_attribute(&jdata->attributes, ORTE_JOB_LAUNCHED_DAEMONS, NULL, OPAL_BOOL))) {
 175         
 176         flag = 1;
 177         opal_dss.pack(buffer, &flag, 1, OPAL_INT8);
 178         
 179         if (ORTE_SUCCESS != (rc = orte_util_nidmap_create(orte_node_pool, buffer))) {
 180             ORTE_ERROR_LOG(rc);
 181             return rc;
 182         }
 183 
 184         
 185         if (NULL == (jptr = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) {
 186             ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
 187             return ORTE_ERR_BAD_PARAM;
 188         }
 189         wireup = OBJ_NEW(opal_buffer_t);
 190         
 191         val = NULL;
 192         if (opal_pmix.legacy_get()) {
 193             if (OPAL_SUCCESS != (rc = opal_pmix.get(ORTE_PROC_MY_NAME, OPAL_PMIX_PROC_URI, NULL, &val)) || NULL == val) {
 194                 ORTE_ERROR_LOG(rc);
 195                 OBJ_RELEASE(wireup);
 196                 return rc;
 197             } else {
 198                 
 199                 if (ORTE_SUCCESS != (rc = opal_dss.pack(wireup, ORTE_PROC_MY_NAME, 1, ORTE_NAME))) {
 200                     ORTE_ERROR_LOG(rc);
 201                     OBJ_RELEASE(wireup);
 202                     return rc;
 203                 }
 204                 
 205                if (ORTE_SUCCESS != (rc = opal_dss.pack(wireup, &val->data.string, 1, OPAL_STRING))) {
 206                     ORTE_ERROR_LOG(rc);
 207                     OBJ_RELEASE(wireup);
 208                     return rc;
 209                 }
 210                 OBJ_RELEASE(val);
 211             }
 212         } else {
 213             if (OPAL_SUCCESS != (rc = opal_pmix.get(ORTE_PROC_MY_NAME, NULL, NULL, &val)) || NULL == val) {
 214                 ORTE_ERROR_LOG(rc);
 215                 OBJ_RELEASE(wireup);
 216                 return rc;
 217             }
 218             
 219             if (OPAL_PTR != val->type) {
 220                 ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
 221                 OBJ_RELEASE(wireup);
 222                 return ORTE_ERR_NOT_FOUND;
 223             }
 224             if (ORTE_SUCCESS != (rc = opal_dss.pack(wireup, ORTE_PROC_MY_NAME, 1, ORTE_NAME))) {
 225                 ORTE_ERROR_LOG(rc);
 226                 OBJ_RELEASE(wireup);
 227                 return rc;
 228             }
 229             modex = (opal_list_t*)val->data.ptr;
 230             numbytes = (int32_t)opal_list_get_size(modex);
 231             if (ORTE_SUCCESS != (rc = opal_dss.pack(wireup, &numbytes, 1, OPAL_INT32))) {
 232                 ORTE_ERROR_LOG(rc);
 233                 OBJ_RELEASE(wireup);
 234                 return rc;
 235             }
 236             OPAL_LIST_FOREACH(kv, modex, opal_value_t) {
 237                 if (ORTE_SUCCESS != (rc = opal_dss.pack(wireup, &kv, 1, OPAL_VALUE))) {
 238                     ORTE_ERROR_LOG(rc);
 239                     OBJ_RELEASE(wireup);
 240                     return rc;
 241                 }
 242             }
 243             OPAL_LIST_RELEASE(modex);
 244             OBJ_RELEASE(val);
 245         }
 246         
 247         for (v=1; v < jptr->procs->size; v++) {
 248             if (NULL == (dmn = (orte_proc_t*)opal_pointer_array_get_item(jptr->procs, v))) {
 249                 continue;
 250             }
 251             val = NULL;
 252             if (opal_pmix.legacy_get()) {
 253                 if (OPAL_SUCCESS != (rc = opal_pmix.get(&dmn->name, OPAL_PMIX_PROC_URI, NULL, &val)) || NULL == val) {
 254                     ORTE_ERROR_LOG(rc);
 255                     OBJ_RELEASE(buffer);
 256                     OBJ_RELEASE(wireup);
 257                     return rc;
 258                 } else {
 259                     
 260                     if (ORTE_SUCCESS != (rc = opal_dss.pack(wireup, &dmn->name, 1, ORTE_NAME))) {
 261                         ORTE_ERROR_LOG(rc);
 262                         OBJ_RELEASE(buffer);
 263                         OBJ_RELEASE(wireup);
 264                         return rc;
 265                     }
 266                     
 267                    if (ORTE_SUCCESS != (rc = opal_dss.pack(wireup, &val->data.string, 1, OPAL_STRING))) {
 268                         ORTE_ERROR_LOG(rc);
 269                         OBJ_RELEASE(buffer);
 270                         OBJ_RELEASE(wireup);
 271                         return rc;
 272                     }
 273                     OBJ_RELEASE(val);
 274                 }
 275             } else {
 276                 if (OPAL_SUCCESS != (rc = opal_pmix.get(&dmn->name, NULL, NULL, &val)) || NULL == val) {
 277                     ORTE_ERROR_LOG(rc);
 278                     OBJ_RELEASE(buffer);
 279                     return rc;
 280                 } else {
 281                     
 282                     if (OPAL_PTR != val->type) {
 283                         ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
 284                         OBJ_RELEASE(buffer);
 285                         return ORTE_ERR_NOT_FOUND;
 286                     }
 287                     if (ORTE_SUCCESS != (rc = opal_dss.pack(wireup, &dmn->name, 1, ORTE_NAME))) {
 288                         ORTE_ERROR_LOG(rc);
 289                         OBJ_RELEASE(buffer);
 290                         OBJ_RELEASE(wireup);
 291                         return rc;
 292                     }
 293                     modex = (opal_list_t*)val->data.ptr;
 294                     numbytes = (int32_t)opal_list_get_size(modex);
 295                     if (ORTE_SUCCESS != (rc = opal_dss.pack(wireup, &numbytes, 1, OPAL_INT32))) {
 296                         ORTE_ERROR_LOG(rc);
 297                         OBJ_RELEASE(buffer);
 298                         OBJ_RELEASE(wireup);
 299                         return rc;
 300                     }
 301                     OPAL_LIST_FOREACH(kv, modex, opal_value_t) {
 302                         if (ORTE_SUCCESS != (rc = opal_dss.pack(wireup, &kv, 1, OPAL_VALUE))) {
 303                             ORTE_ERROR_LOG(rc);
 304                             OBJ_RELEASE(buffer);
 305                             OBJ_RELEASE(wireup);
 306                             return rc;
 307                         }
 308                     }
 309                     OPAL_LIST_RELEASE(modex);
 310                     OBJ_RELEASE(val);
 311                 }
 312             }
 313         }
 314         
 315         opal_dss.unload(wireup, (void**)&bo.bytes, &numbytes);
 316         OBJ_RELEASE(wireup);
 317         
 318         bo.size = numbytes;
 319         boptr = &bo;
 320         if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &boptr, 1, OPAL_BYTE_OBJECT))) {
 321             ORTE_ERROR_LOG(rc);
 322             return rc;
 323         }
 324         
 325         if (NULL != bo.bytes) {
 326             free(bo.bytes);
 327         }
 328 
 329         
 330 
 331 
 332 
 333         if (orte_get_attribute(&jdata->attributes, ORTE_JOB_LAUNCHED_DAEMONS, NULL, OPAL_BOOL)) {
 334             flag = 1;
 335             opal_dss.pack(buffer, &flag, 1, OPAL_INT8);
 336             OBJ_CONSTRUCT(&jobdata, opal_buffer_t);
 337             rc = opal_hash_table_get_first_key_uint32(orte_job_data, &key, (void **)&jptr, &nptr);
 338             while (OPAL_SUCCESS == rc) {
 339                 
 340                 if (NULL != jptr && jptr != jdata &&
 341                     ORTE_PROC_MY_NAME->jobid != jptr->jobid) {
 342                     OBJ_CONSTRUCT(&priorjob, opal_buffer_t);
 343                     
 344                     if (ORTE_SUCCESS != (rc = opal_dss.pack(&priorjob, &jptr, 1, ORTE_JOB))) {
 345                         ORTE_ERROR_LOG(rc);
 346                         OBJ_DESTRUCT(&jobdata);
 347                         OBJ_DESTRUCT(&priorjob);
 348                         return rc;
 349                     }
 350                     
 351                     for (n=0; n < jptr->procs->size; n++) {
 352                         if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jptr->procs, n))) {
 353                             continue;
 354                         }
 355                         if (ORTE_SUCCESS != (rc = opal_dss.pack(&priorjob, &proc->parent, 1, ORTE_VPID))) {
 356                             ORTE_ERROR_LOG(rc);
 357                             OBJ_DESTRUCT(&jobdata);
 358                             OBJ_DESTRUCT(&priorjob);
 359                             return rc;
 360                         }
 361                     }
 362                     
 363                     wireup = &priorjob;
 364                     if (ORTE_SUCCESS != (rc = opal_dss.pack(&jobdata, &wireup, 1, OPAL_BUFFER))) {
 365                         ORTE_ERROR_LOG(rc);
 366                         OBJ_DESTRUCT(&jobdata);
 367                         OBJ_DESTRUCT(&priorjob);
 368                         return rc;
 369                     }
 370                     OBJ_DESTRUCT(&priorjob);
 371                 }
 372                 rc = opal_hash_table_get_next_key_uint32(orte_job_data, &key, (void **)&jptr, nptr, &nptr);
 373             }
 374             
 375             wireup = &jobdata;
 376             if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &wireup, 1, OPAL_BUFFER))) {
 377                 ORTE_ERROR_LOG(rc);
 378                 OBJ_DESTRUCT(&jobdata);
 379                 return rc;
 380             }
 381             OBJ_DESTRUCT(&jobdata);
 382         } else {
 383             flag = 0;
 384             opal_dss.pack(buffer, &flag, 1, OPAL_INT8);
 385         }
 386         orte_node_info_communicated = true;
 387     } else {
 388         
 389         flag = 0;
 390         opal_dss.pack(buffer, &flag, 1, OPAL_INT8);
 391         
 392         flag = 0;
 393         opal_dss.pack(buffer, &flag, 1, OPAL_INT8);
 394     }
 395 
 396     
 397     if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &jdata, 1, ORTE_JOB))) {
 398         ORTE_ERROR_LOG(rc);
 399         return rc;
 400     }
 401 
 402     if (!orte_get_attribute(&jdata->attributes, ORTE_JOB_FULLY_DESCRIBED, NULL, OPAL_BOOL)) {
 403         
 404         if (ORTE_SUCCESS != (rc = orte_util_generate_ppn(jdata, buffer))) {
 405             ORTE_ERROR_LOG(rc);
 406             return rc;
 407         }
 408     }
 409 
 410     
 411     if (NULL != opal_pmix.server_setup_application) {
 412         OBJ_CONSTRUCT(&ilist, opal_list_t);
 413         
 414         kv = OBJ_NEW(opal_value_t);
 415         kv->key = strdup(OPAL_PMIX_ALLOC_NETWORK_ID);
 416         kv->type = OPAL_STRING;
 417         opal_asprintf(&kv->data.string, "%s.net", ORTE_JOBID_PRINT(jdata->jobid));
 418         opal_list_append(&ilist, &kv->super);
 419         
 420         kv = OBJ_NEW(opal_value_t);
 421         kv->key = strdup(OPAL_PMIX_ALLOC_NETWORK_SEC_KEY);
 422         kv->type = OPAL_BOOL;
 423         kv->data.flag = true;
 424         opal_list_append(&ilist, &kv->super);
 425         
 426         kv = OBJ_NEW(opal_value_t);
 427         kv->key = strdup(OPAL_PMIX_SETUP_APP_ENVARS);
 428         kv->type = OPAL_BOOL;
 429         kv->data.flag = true;
 430         opal_list_append(&ilist, &kv->super);
 431         
 432 
 433         if (OPAL_SUCCESS != (rc = opal_pmix.server_setup_application(jdata->jobid, &ilist, setup_cbfunc, jdata))) {
 434             ORTE_ERROR_LOG(rc);
 435         }
 436         OPAL_LIST_DESTRUCT(&ilist);
 437         return rc;
 438     }
 439 
 440     
 441     ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_SEND_LAUNCH_MSG);
 442 
 443     return ORTE_SUCCESS;
 444 }
 445 
 446 static void ls_cbunc(int status, void *cbdata)
 447 {
 448     opal_pmix_lock_t *lock = (opal_pmix_lock_t*)cbdata;
 449     OPAL_PMIX_WAKEUP_THREAD(lock);
 450 }
 451 
 452 int orte_odls_base_default_construct_child_list(opal_buffer_t *buffer,
 453                                                 orte_jobid_t *job)
 454 {
 455     int rc;
 456     orte_std_cntr_t cnt;
 457     orte_job_t *jdata=NULL, *daemons;
 458     orte_node_t *node;
 459     orte_vpid_t dmnvpid, v;
 460     int32_t n;
 461     opal_buffer_t *bptr, *jptr;
 462     orte_proc_t *pptr, *dmn;
 463     orte_app_context_t *app;
 464     int8_t flag;
 465     opal_value_t *kv;
 466     opal_list_t local_support, cache;
 467     opal_pmix_lock_t lock;
 468 
 469     OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
 470                          "%s odls:constructing child list",
 471                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
 472 
 473     
 474     *job = ORTE_JOBID_INVALID;
 475     
 476     daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
 477     OPAL_PMIX_CONSTRUCT_LOCK(&lock);
 478     OBJ_CONSTRUCT(&local_support, opal_list_t);
 479 
 480     
 481     cnt=1;
 482     if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &flag, &cnt, OPAL_INT8))) {
 483         ORTE_ERROR_LOG(rc);
 484         goto REPORT_ERROR;
 485     }
 486 
 487     if (0 != flag) {
 488         
 489         cnt=1;
 490         if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &bptr, &cnt, OPAL_BUFFER))) {
 491             *job = ORTE_JOBID_INVALID;
 492             ORTE_ERROR_LOG(rc);
 493             OBJ_RELEASE(bptr);
 494             goto REPORT_ERROR;
 495         }
 496         cnt=1;
 497         while (ORTE_SUCCESS == (rc = opal_dss.unpack(bptr, &jptr, &cnt, OPAL_BUFFER))) {
 498             
 499             cnt=1;
 500             if (ORTE_SUCCESS != (rc = opal_dss.unpack(jptr, &jdata, &cnt, ORTE_JOB))) {
 501                 *job = ORTE_JOBID_INVALID;
 502                 ORTE_ERROR_LOG(rc);
 503                 OBJ_RELEASE(bptr);
 504                 OBJ_RELEASE(jptr);
 505                 goto REPORT_ERROR;
 506             }
 507             
 508             if (NULL == orte_get_job_data_object(jdata->jobid)) {
 509                 
 510                 opal_hash_table_set_value_uint32(orte_job_data, jdata->jobid, jdata);
 511             } else {
 512                 
 513                 jdata->jobid = ORTE_JOBID_INVALID;
 514                 OBJ_RELEASE(jdata);
 515                 OBJ_RELEASE(jptr);
 516                 cnt=1;
 517                 continue;
 518             }
 519             
 520             for (v=0; v < jdata->num_procs; v++) {
 521                 if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, v))) {
 522                     pptr = OBJ_NEW(orte_proc_t);
 523                     pptr->name.jobid = jdata->jobid;
 524                     pptr->name.vpid = v;
 525                     opal_pointer_array_set_item(jdata->procs, v, pptr);
 526                 }
 527                 cnt=1;
 528                 if (ORTE_SUCCESS != (rc = opal_dss.unpack(jptr, &dmnvpid, &cnt, ORTE_VPID))) {
 529                     ORTE_ERROR_LOG(rc);
 530                     OBJ_RELEASE(jptr);
 531                     OBJ_RELEASE(bptr);
 532                     goto REPORT_ERROR;
 533                 }
 534                 
 535                 if (NULL == (dmn = (orte_proc_t*)opal_pointer_array_get_item(daemons->procs, dmnvpid))) {
 536                     ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
 537                     rc = ORTE_ERR_NOT_FOUND;
 538                     OBJ_RELEASE(jptr);
 539                     OBJ_RELEASE(bptr);
 540                     goto REPORT_ERROR;
 541                 }
 542                 
 543                 OBJ_RETAIN(dmn->node);
 544                 pptr->node = dmn->node;
 545             }
 546             
 547             OBJ_RELEASE(jptr);
 548             cnt = 1;
 549         }
 550         OBJ_RELEASE(bptr);
 551     }
 552 
 553     
 554     cnt=1;
 555     if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &jdata, &cnt, ORTE_JOB))) {
 556         *job = ORTE_JOBID_INVALID;
 557         ORTE_ERROR_LOG(rc);
 558         goto REPORT_ERROR;
 559     }
 560     if (ORTE_JOBID_INVALID == jdata->jobid) {
 561         ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
 562         rc = ORTE_ERR_BAD_PARAM;
 563         goto REPORT_ERROR;
 564     }
 565     *job = jdata->jobid;
 566 
 567     OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
 568                          "%s odls:construct_child_list unpacking data to launch job %s",
 569                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(*job)));
 570 
 571     
 572 
 573 
 574 
 575     if (ORTE_PROC_IS_HNP) {
 576         
 577 
 578 
 579 
 580 
 581         jdata->jobid = ORTE_JOBID_INVALID;
 582         OBJ_RELEASE(jdata);
 583         
 584         if (NULL == (jdata = orte_get_job_data_object(*job))) {
 585             ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
 586             rc = ORTE_ERR_NOT_FOUND;
 587             goto REPORT_ERROR;
 588         }
 589     } else {
 590         opal_hash_table_set_value_uint32(orte_job_data, jdata->jobid, jdata);
 591 
 592         
 593         if (NULL == jdata->map) {
 594             jdata->map = OBJ_NEW(orte_job_map_t);
 595         }
 596     }
 597 
 598     
 599 
 600 
 601     if (!orte_get_attribute(&jdata->attributes, ORTE_JOB_FULLY_DESCRIBED, NULL, OPAL_BOOL)) {
 602         
 603 
 604 
 605         if (ORTE_SUCCESS != (rc = orte_util_decode_ppn(jdata, buffer))) {
 606             ORTE_ERROR_LOG(rc);
 607             goto REPORT_ERROR;
 608         }
 609 
 610         if (!ORTE_PROC_IS_HNP) {
 611             
 612             if (ORTE_SUCCESS != (rc = orte_rmaps_base_assign_locations(jdata))) {
 613                 ORTE_ERROR_LOG(rc);
 614                 goto REPORT_ERROR;
 615             }
 616         }
 617 
 618         
 619 
 620         if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata))) {
 621             ORTE_ERROR_LOG(rc);
 622             goto REPORT_ERROR;
 623         }
 624         
 625         if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_local_ranks(jdata))) {
 626             ORTE_ERROR_LOG(rc);
 627             goto REPORT_ERROR;
 628         }
 629     }
 630 
 631     
 632 
 633     cnt=1;
 634     rc = opal_dss.unpack(buffer, &bptr, &cnt, OPAL_BUFFER);
 635     if (OPAL_SUCCESS == rc) {
 636         
 637         cnt=1;
 638         OBJ_CONSTRUCT(&cache, opal_list_t);
 639         while (ORTE_SUCCESS == (rc = opal_dss.unpack(bptr, &kv, &cnt, OPAL_VALUE))) {
 640             
 641 
 642             if (0 == strcmp(kv->key, OPAL_PMIX_SET_ENVAR) ||
 643                 0 == strcmp(kv->key, OPAL_PMIX_ADD_ENVAR) ||
 644                 0 == strcmp(kv->key, OPAL_PMIX_UNSET_ENVAR) ||
 645                 0 == strcmp(kv->key, OPAL_PMIX_PREPEND_ENVAR) ||
 646                 0 == strcmp(kv->key, OPAL_PMIX_APPEND_ENVAR)) {
 647                 opal_output_verbose(5, orte_odls_base_framework.framework_output,
 648                                     "ORTE:ODLS ADDING ENVAR %s", kv->data.envar.envar);
 649                 opal_list_prepend(&cache, &kv->super);
 650             } else {
 651                 
 652                 opal_list_append(&local_support, &kv->super);
 653             }
 654         }
 655         OBJ_RELEASE(bptr);
 656         
 657         while (NULL != (kv = (opal_value_t*)opal_list_remove_first(&cache))) {
 658             if (0 == strcmp(kv->key, OPAL_PMIX_SET_ENVAR)) {
 659                 orte_prepend_attribute(&jdata->attributes, ORTE_JOB_SET_ENVAR,
 660                                        ORTE_ATTR_GLOBAL, &kv->data.envar, OPAL_ENVAR);
 661             } else if (0 == strcmp(kv->key, OPAL_PMIX_ADD_ENVAR)) {
 662                 orte_prepend_attribute(&jdata->attributes, ORTE_JOB_ADD_ENVAR,
 663                                        ORTE_ATTR_GLOBAL, &kv->data.envar, OPAL_ENVAR);
 664             } else if (0 == strcmp(kv->key, OPAL_PMIX_UNSET_ENVAR)) {
 665                 orte_prepend_attribute(&jdata->attributes, ORTE_JOB_UNSET_ENVAR,
 666                                        ORTE_ATTR_GLOBAL, kv->data.string, OPAL_STRING);
 667             } else if (0 == strcmp(kv->key, OPAL_PMIX_PREPEND_ENVAR)) {
 668                 orte_prepend_attribute(&jdata->attributes, ORTE_JOB_PREPEND_ENVAR,
 669                                        ORTE_ATTR_GLOBAL, &kv->data.envar, OPAL_ENVAR);
 670             } else if (0 == strcmp(kv->key, OPAL_PMIX_APPEND_ENVAR)) {
 671                 orte_prepend_attribute(&jdata->attributes, ORTE_JOB_APPEND_ENVAR,
 672                                        ORTE_ATTR_GLOBAL, &kv->data.envar, OPAL_ENVAR);
 673             }
 674             OBJ_RELEASE(kv);
 675         }
 676         OPAL_LIST_DESTRUCT(&cache);
 677     }
 678 
 679     
 680 
 681 
 682     for (n=0; n < jdata->procs->size; n++) {
 683         if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, n))) {
 684             continue;
 685         }
 686         if (ORTE_PROC_STATE_UNDEF == pptr->state) {
 687             
 688             continue;
 689         }
 690         if (!ORTE_PROC_IS_HNP &&
 691             orte_get_attribute(&jdata->attributes, ORTE_JOB_FULLY_DESCRIBED, NULL, OPAL_BOOL)) {
 692             
 693 
 694             opal_output_verbose(5, orte_odls_base_framework.framework_output,
 695                                 "%s GETTING DAEMON FOR PROC %s WITH PARENT %s",
 696                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 697                                 ORTE_NAME_PRINT(&pptr->name),
 698                                 ORTE_VPID_PRINT(pptr->parent));
 699             if (ORTE_VPID_INVALID == pptr->parent) {
 700                 ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
 701                 rc = ORTE_ERR_BAD_PARAM;
 702                 goto REPORT_ERROR;
 703             }
 704             
 705             if (NULL == (dmn = (orte_proc_t*)opal_pointer_array_get_item(daemons->procs, pptr->parent))) {
 706                 ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
 707                 rc = ORTE_ERR_NOT_FOUND;
 708                 goto REPORT_ERROR;
 709             }
 710             OBJ_RETAIN(dmn->node);
 711             pptr->node = dmn->node;
 712             
 713             if (!ORTE_FLAG_TEST(pptr->node, ORTE_NODE_FLAG_MAPPED)) {
 714                 OBJ_RETAIN(pptr->node);
 715                 opal_pointer_array_add(jdata->map->nodes, pptr->node);
 716                 jdata->map->num_nodes++;
 717                 ORTE_FLAG_SET(pptr->node, ORTE_NODE_FLAG_MAPPED);
 718             }
 719             
 720             OBJ_RETAIN(pptr);
 721             opal_pointer_array_add(pptr->node->procs, pptr);
 722             pptr->node->num_procs++;
 723         }
 724         
 725         if (pptr->parent == ORTE_PROC_MY_NAME->vpid) {
 726             
 727             if (!ORTE_FLAG_TEST(pptr, ORTE_PROC_FLAG_LOCAL)) {
 728                 
 729                 OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
 730                                      "%s[%s:%d] adding proc %s to my local list",
 731                                      ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 732                                      __FILE__, __LINE__,
 733                                      ORTE_NAME_PRINT(&pptr->name)));
 734                 
 735                 jdata->num_local_procs++;
 736                 
 737                 OBJ_RETAIN(pptr);
 738                 ORTE_FLAG_SET(pptr, ORTE_PROC_FLAG_LOCAL);
 739                 opal_pointer_array_add(orte_local_children, pptr);
 740             }
 741 
 742             
 743             if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_RESTART)) {
 744                 orte_set_attribute(&pptr->attributes, ORTE_PROC_NOBARRIER, ORTE_ATTR_LOCAL, NULL, OPAL_BOOL);
 745             }
 746             
 747             app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, pptr->app_idx);
 748             ORTE_FLAG_SET(app, ORTE_APP_FLAG_USED_ON_NODE);
 749         }
 750     }
 751     if (orte_get_attribute(&jdata->attributes, ORTE_JOB_FULLY_DESCRIBED, NULL, OPAL_BOOL)) {
 752         
 753         for (n=0; n < jdata->map->nodes->size; n++) {
 754             if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, n))) {
 755                 ORTE_FLAG_UNSET(node, ORTE_NODE_FLAG_MAPPED);
 756             }
 757         }
 758     }
 759 
 760     if (!orte_get_attribute(&jdata->attributes, ORTE_JOB_FULLY_DESCRIBED, NULL, OPAL_BOOL)) {
 761         
 762         if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_bindings(jdata))) {
 763             ORTE_ERROR_LOG(rc);
 764             goto REPORT_ERROR;
 765         }
 766     }
 767 
 768     
 769     if (jdata->map->display_map) {
 770         orte_rmaps_base_display_map(jdata);
 771     }
 772 
 773     
 774 
 775     if (ORTE_SUCCESS != (rc = orte_pmix_server_register_nspace(jdata, false))) {
 776         ORTE_ERROR_LOG(rc);
 777         goto REPORT_ERROR;
 778     }
 779 
 780     
 781 
 782 
 783     if (0 < opal_list_get_size(&local_support) &&
 784         NULL != opal_pmix.server_setup_local_support) {
 785         if (OPAL_SUCCESS != (rc = opal_pmix.server_setup_local_support(jdata->jobid, &local_support,
 786                                                                        ls_cbunc, &lock))) {
 787             ORTE_ERROR_LOG(rc);
 788             goto REPORT_ERROR;
 789         }
 790     } else {
 791         lock.active = false;  
 792     }
 793 
 794     
 795     orte_rtc.assign(jdata);
 796 
 797     
 798     orte_odls_base_start_threads(jdata);
 799 
 800     
 801 
 802 
 803 
 804 
 805     
 806     OPAL_PMIX_WAIT_THREAD(&lock);
 807     OPAL_PMIX_DESTRUCT_LOCK(&lock);
 808     OPAL_LIST_DESTRUCT(&local_support);
 809     return ORTE_SUCCESS;
 810 
 811   REPORT_ERROR:
 812     OPAL_PMIX_DESTRUCT_LOCK(&lock);
 813     OPAL_LIST_DESTRUCT(&local_support);
 814     
 815 
 816 
 817 
 818 
 819 
 820     ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_NEVER_LAUNCHED);
 821     return rc;
 822 }
 823 
 824 static int setup_path(orte_app_context_t *app, char **wdir)
 825 {
 826     int rc=ORTE_SUCCESS;
 827     char dir[MAXPATHLEN];
 828 
 829     if (!orte_get_attribute(&app->attributes, ORTE_APP_SSNDIR_CWD, NULL, OPAL_BOOL)) {
 830         
 831 
 832 
 833 
 834         if (ORTE_SUCCESS != (rc = orte_util_check_context_cwd(app, true))) {
 835             
 836             goto CLEANUP;
 837         }
 838 
 839         
 840 
 841 
 842 
 843 
 844 
 845 
 846 
 847 
 848 
 849 
 850 
 851 
 852         if (NULL == getcwd(dir, sizeof(dir))) {
 853             return ORTE_ERR_OUT_OF_RESOURCE;
 854         }
 855         *wdir = strdup(dir);
 856         opal_setenv("PWD", dir, true, &app->env);
 857         
 858         opal_setenv(OPAL_MCA_PREFIX"initial_wdir", dir, true, &app->env);
 859     } else {
 860         *wdir = NULL;
 861     }
 862 
 863  CLEANUP:
 864     return rc;
 865 }
 866 
 867 
 868 
 869 
 870 
 871 static void timer_cb(int fd, short event, void *cbdata)
 872 {
 873     orte_timer_t *tm = (orte_timer_t*)cbdata;
 874     orte_odls_launch_local_t *ll = (orte_odls_launch_local_t*)tm->payload;
 875 
 876     ORTE_ACQUIRE_OBJECT(tm);
 877 
 878     
 879     ll->retries++;
 880 
 881     
 882     opal_event_active(ll->ev, OPAL_EV_WRITE, 1);
 883 
 884     
 885     OBJ_RELEASE(tm);
 886 }
 887 
 888 static int compute_num_procs_alive(orte_jobid_t job)
 889 {
 890     int i;
 891     orte_proc_t *child;
 892     int num_procs_alive = 0;
 893 
 894     for (i=0; i < orte_local_children->size; i++) {
 895         if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
 896             continue;
 897         }
 898         if (!ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_ALIVE)) {
 899             continue;
 900         }
 901         
 902 
 903 
 904         if (job == child->name.jobid) {
 905             continue;
 906         }
 907         num_procs_alive++;
 908     }
 909     return num_procs_alive;
 910 }
 911 
 912 void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata)
 913 {
 914     orte_odls_spawn_caddy_t *cd = (orte_odls_spawn_caddy_t*)cbdata;
 915     orte_job_t *jobdat = cd->jdata;
 916     orte_app_context_t *app = cd->app;
 917     orte_proc_t *child = cd->child;
 918     int rc, i;
 919     bool found;
 920     orte_proc_state_t state;
 921 
 922     ORTE_ACQUIRE_OBJECT(cd);
 923 
 924     
 925     cd->env = opal_argv_copy(app->env);
 926 
 927     
 928 
 929 
 930     child->exit_code = 0;
 931     ORTE_FLAG_UNSET(child, ORTE_PROC_FLAG_WAITPID);
 932 
 933     
 934     if (OPAL_SUCCESS != (rc = opal_pmix.server_setup_fork(&child->name, &cd->env))) {
 935         ORTE_ERROR_LOG(rc);
 936         state = ORTE_PROC_STATE_FAILED_TO_LAUNCH;
 937         goto errorout;
 938     }
 939 
 940     
 941 
 942 
 943     if (ORTE_FLAG_TEST(jobdat, ORTE_JOB_FLAG_FORWARD_OUTPUT)) {
 944         ORTE_FLAG_UNSET(child, ORTE_PROC_FLAG_IOF_COMPLETE);
 945     } else {
 946         ORTE_FLAG_SET(child, ORTE_PROC_FLAG_IOF_COMPLETE);
 947     }
 948     child->pid = 0;
 949     if (NULL != child->rml_uri) {
 950         free(child->rml_uri);
 951         child->rml_uri = NULL;
 952     }
 953 
 954     
 955 
 956 
 957     if (ORTE_SUCCESS != (rc = orte_schizo.setup_child(jobdat, child, app, &cd->env))) {
 958         ORTE_ERROR_LOG(rc);
 959         state = ORTE_PROC_STATE_FAILED_TO_LAUNCH;
 960         goto errorout;
 961     }
 962 
 963     
 964     if (NULL != orte_xterm && !ORTE_FLAG_TEST(jobdat, ORTE_JOB_FLAG_DEBUGGER_DAEMON)) {
 965         opal_list_item_t *nmitem;
 966         orte_namelist_t *nm;
 967         
 968         found = false;
 969         for (nmitem = opal_list_get_first(&orte_odls_globals.xterm_ranks);
 970              nmitem != opal_list_get_end(&orte_odls_globals.xterm_ranks);
 971              nmitem = opal_list_get_next(nmitem)) {
 972             nm = (orte_namelist_t*)nmitem;
 973             if (ORTE_VPID_WILDCARD == nm->name.vpid ||
 974                 child->name.vpid == nm->name.vpid) {
 975                 
 976 
 977                 cd->argv = opal_argv_copy(orte_odls_globals.xtermcmd);
 978                 
 979                 free(cd->argv[2]);
 980                 opal_asprintf(&cd->argv[2], "Rank %s", ORTE_VPID_PRINT(child->name.vpid));
 981                 
 982                 for (i=0; NULL != app->argv[i]; i++) {
 983                     opal_argv_append_nosize(&cd->argv, app->argv[i]);
 984                 }
 985                 
 986                 cd->cmd = strdup(orte_odls_globals.xtermcmd[0]);
 987                 found = true;
 988                 break;
 989             } else if (jobdat->num_procs <= nm->name.vpid) {  
 990                 
 991                 orte_show_help("help-orte-odls-base.txt",
 992                                "orte-odls-base:xterm-rank-out-of-bounds",
 993                                true, orte_process_info.nodename,
 994                                nm->name.vpid, jobdat->num_procs);
 995                 state = ORTE_PROC_STATE_FAILED_TO_LAUNCH;
 996                 goto errorout;
 997             }
 998         }
 999         if (!found) {
1000             cd->cmd = strdup(app->app);
1001             cd->argv = opal_argv_copy(app->argv);
1002         }
1003     } else if (NULL != orte_fork_agent) {
1004         
1005         cd->argv = opal_argv_copy(orte_fork_agent);
1006         
1007         for (i=0; NULL != app->argv[i]; i++) {
1008             opal_argv_append_nosize(&cd->argv, app->argv[i]);
1009         }
1010         cd->cmd = opal_path_findv(orte_fork_agent[0], X_OK, orte_launch_environ, NULL);
1011         if (NULL == cd->cmd) {
1012             orte_show_help("help-orte-odls-base.txt",
1013                            "orte-odls-base:fork-agent-not-found",
1014                            true, orte_process_info.nodename, orte_fork_agent[0]);
1015             state = ORTE_PROC_STATE_FAILED_TO_LAUNCH;
1016             goto errorout;
1017         }
1018     } else {
1019         cd->cmd = strdup(app->app);
1020         cd->argv = opal_argv_copy(app->argv);
1021     }
1022 
1023     
1024     if (cd->index_argv && !ORTE_FLAG_TEST(jobdat, ORTE_JOB_FLAG_DEBUGGER_DAEMON)) {
1025         char *param;
1026         opal_asprintf(¶m, "%s-%d", cd->argv[0], (int)child->name.vpid);
1027         free(cd->argv[0]);
1028         cd->argv[0] = param;
1029     }
1030 
1031     opal_output_verbose(5, orte_odls_base_framework.framework_output,
1032                         "%s odls:launch spawning child %s",
1033                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1034                         ORTE_NAME_PRINT(&child->name));
1035 
1036     if (15 < opal_output_get_verbosity(orte_odls_base_framework.framework_output)) {
1037         
1038         opal_dss.dump(orte_odls_base_framework.framework_output, app, ORTE_APP_CONTEXT);
1039     }
1040 
1041     if (ORTE_SUCCESS != (rc = cd->fork_local(cd))) {
1042         
1043         state = ORTE_PROC_STATE_FAILED_TO_START;
1044         goto errorout;
1045     }
1046 
1047     ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_RUNNING);
1048     OBJ_RELEASE(cd);
1049     return;
1050 
1051   errorout:
1052     ORTE_FLAG_UNSET(child, ORTE_PROC_FLAG_ALIVE);
1053     child->exit_code = rc;
1054     ORTE_ACTIVATE_PROC_STATE(&child->name, state);
1055     OBJ_RELEASE(cd);
1056 }
1057 
1058 void orte_odls_base_default_launch_local(int fd, short sd, void *cbdata)
1059 {
1060     orte_app_context_t *app;
1061     orte_proc_t *child=NULL;
1062     int rc=ORTE_SUCCESS;
1063     char basedir[MAXPATHLEN];
1064     int j, idx;
1065     int total_num_local_procs = 0;
1066     orte_odls_launch_local_t *caddy = (orte_odls_launch_local_t*)cbdata;
1067     orte_job_t *jobdat;
1068     orte_jobid_t job = caddy->job;
1069     orte_odls_base_fork_local_proc_fn_t fork_local = caddy->fork_local;
1070     bool index_argv;
1071     char *msg;
1072     orte_odls_spawn_caddy_t *cd;
1073     opal_event_base_t *evb;
1074     char *effective_dir = NULL;
1075     char **argvptr;
1076     char *pathenv = NULL, *mpiexec_pathenv = NULL;
1077     char *full_search;
1078 
1079     ORTE_ACQUIRE_OBJECT(caddy);
1080 
1081     opal_output_verbose(5, orte_odls_base_framework.framework_output,
1082                         "%s local:launch",
1083                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
1084 
1085     
1086 
1087 
1088 
1089     if (NULL == getcwd(basedir, sizeof(basedir))) {
1090         ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_FAILED_TO_LAUNCH);
1091         goto ERROR_OUT;
1092     }
1093     
1094     if (NULL == (jobdat = orte_get_job_data_object(job))) {
1095         ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
1096         
1097 
1098 
1099         ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_FAILED_TO_LAUNCH);
1100         goto ERROR_OUT;
1101     }
1102 
1103     
1104     if (0 == jobdat->num_local_procs) {
1105         
1106         opal_output_verbose(5, orte_odls_base_framework.framework_output,
1107                             "%s local:launch no local procs",
1108                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
1109         goto GETOUT;
1110     }
1111 
1112     
1113     index_argv = orte_get_attribute(&jobdat->attributes, ORTE_JOB_INDEX_ARGV, NULL, OPAL_BOOL);
1114 
1115     
1116     total_num_local_procs = compute_num_procs_alive(job) + jobdat->num_local_procs;
1117 
1118     
1119 
1120 
1121 
1122 
1123     if (0 < opal_sys_limits.num_procs) {
1124         OPAL_OUTPUT_VERBOSE((10,  orte_odls_base_framework.framework_output,
1125                              "%s checking limit on num procs %d #children needed %d",
1126                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1127                              opal_sys_limits.num_procs, total_num_local_procs));
1128         if (opal_sys_limits.num_procs < total_num_local_procs) {
1129             if (2 < caddy->retries) {
1130                 
1131                 ORTE_ACTIVATE_JOB_STATE(jobdat, ORTE_JOB_STATE_FAILED_TO_LAUNCH);
1132                 goto ERROR_OUT;
1133             }
1134             
1135 
1136 
1137 
1138             ORTE_DETECT_TIMEOUT(1000, 1000, -1, timer_cb, caddy);
1139             return;
1140         }
1141     }
1142 
1143     
1144 
1145 
1146 
1147 
1148     if (0 < opal_sys_limits.num_files) {
1149         int limit;
1150         limit = 4*total_num_local_procs + 6*jobdat->num_local_procs;
1151         OPAL_OUTPUT_VERBOSE((10,  orte_odls_base_framework.framework_output,
1152                              "%s checking limit on file descriptors %d need %d",
1153                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1154                              opal_sys_limits.num_files, limit));
1155         if (opal_sys_limits.num_files < limit) {
1156             if (2 < caddy->retries) {
1157                 
1158                 for (idx=0; idx < orte_local_children->size; idx++) {
1159                     if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, idx))) {
1160                         continue;
1161                     }
1162                     if (OPAL_EQUAL == opal_dss.compare(&job, &(child->name.jobid), ORTE_JOBID)) {
1163                         child->exit_code = ORTE_PROC_STATE_FAILED_TO_LAUNCH;
1164                         ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_LAUNCH);
1165                     }
1166                 }
1167                 goto ERROR_OUT;
1168             }
1169             
1170             ORTE_DETECT_TIMEOUT(1000, 1000, -1, timer_cb, caddy);
1171             return;
1172         }
1173     }
1174 
1175     for (j=0; j < jobdat->apps->size; j++) {
1176         if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jobdat->apps, j))) {
1177             continue;
1178         }
1179 
1180         
1181         if (!ORTE_FLAG_TEST(app, ORTE_APP_FLAG_USED_ON_NODE)) {
1182             opal_output_verbose(5, orte_odls_base_framework.framework_output,
1183                                 "%s app %d not used on node",
1184                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), j);
1185             continue;
1186         }
1187 
1188         
1189         if (ORTE_SUCCESS != (rc = orte_schizo.setup_fork(jobdat, app))) {
1190 
1191             OPAL_OUTPUT_VERBOSE((10, orte_odls_base_framework.framework_output,
1192                                  "%s odls:launch:setup_fork failed with error %s",
1193                                  ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1194                                  ORTE_ERROR_NAME(rc)));
1195 
1196             
1197 
1198 
1199 
1200 
1201 
1202 
1203             
1204             for (idx=0; idx < orte_local_children->size; idx++) {
1205                 if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, idx))) {
1206                     continue;
1207                 }
1208                 if (OPAL_EQUAL == opal_dss.compare(&job, &(child->name.jobid), ORTE_JOBID) &&
1209                     j == (int)child->app_idx) {
1210                     child->exit_code = ORTE_PROC_STATE_FAILED_TO_LAUNCH;
1211                     ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_LAUNCH);
1212                 }
1213             }
1214             goto GETOUT;
1215         }
1216 
1217         
1218 
1219 
1220         if (ORTE_SUCCESS != (rc = setup_path(app, &effective_dir))) {
1221             OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1222                                  "%s odls:launch:setup_path failed with error %s(%d)",
1223                                  ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1224                                  ORTE_ERROR_NAME(rc), rc));
1225             
1226 
1227 
1228 
1229 
1230 
1231 
1232             
1233             for (idx=0; idx < orte_local_children->size; idx++) {
1234                 if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, idx))) {
1235                     continue;
1236                 }
1237                 if (OPAL_EQUAL == opal_dss.compare(&job, &(child->name.jobid), ORTE_JOBID) &&
1238                     j == (int)child->app_idx) {
1239                     child->exit_code = rc;
1240                     ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_LAUNCH);
1241                 }
1242             }
1243             goto GETOUT;
1244         }
1245 
1246         
1247         if (ORTE_SUCCESS != (rc = orte_filem.link_local_files(jobdat, app))) {
1248             
1249             for (idx=0; idx < orte_local_children->size; idx++) {
1250                 if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, idx))) {
1251                     continue;
1252                 }
1253                 if (OPAL_EQUAL == opal_dss.compare(&job, &(child->name.jobid), ORTE_JOBID) &&
1254                     j == (int)child->app_idx) {
1255                     child->exit_code = rc;
1256                     ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_LAUNCH);
1257                 }
1258             }
1259             goto GETOUT;
1260         }
1261 
1262         
1263         for (argvptr = app->env; *argvptr != NULL; argvptr++) {
1264             if (0 == strncmp("OMPI_exec_path=", *argvptr, 15)) {
1265                 mpiexec_pathenv = *argvptr + 15;
1266             }
1267             if (0 == strncmp("PATH=", *argvptr, 5)) {
1268                 pathenv = *argvptr + 5;
1269             }
1270         }
1271 
1272         
1273 
1274 
1275 
1276 
1277         if (NULL != mpiexec_pathenv) {
1278             argvptr = NULL;
1279             if (pathenv != NULL) {
1280                 opal_asprintf(&full_search, "%s:%s", mpiexec_pathenv, pathenv);
1281             } else {
1282                 opal_asprintf(&full_search, "%s", mpiexec_pathenv);
1283             }
1284             opal_setenv("PATH", full_search, true, &argvptr);
1285             free(full_search);
1286         } else {
1287             argvptr = app->env;
1288         }
1289 
1290         rc = orte_util_check_context_app(app, argvptr);
1291         
1292         if (NULL != mpiexec_pathenv) {
1293             opal_argv_free(argvptr);
1294         }
1295         if (ORTE_SUCCESS != rc) {
1296             
1297             for (idx=0; idx < orte_local_children->size; idx++) {
1298                 if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, idx))) {
1299                     continue;
1300                 }
1301                 if (OPAL_EQUAL == opal_dss.compare(&job, &(child->name.jobid), ORTE_JOBID) &&
1302                     j == (int)child->app_idx) {
1303                     child->exit_code = rc;
1304                     ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_LAUNCH);
1305                 }
1306             }
1307             goto GETOUT;
1308         }
1309 
1310 
1311         
1312         opal_setenv(OPAL_MCA_PREFIX"orte_launch", "1", true, &app->env);
1313 
1314         
1315         if (OPAL_SUCCESS != (rc = opal_util_init_sys_limits(&msg))) {
1316             orte_show_help("help-orte-odls-default.txt", "set limit", true,
1317                            orte_process_info.nodename, app,
1318                            __FILE__, __LINE__, msg);
1319             
1320             for (idx=0; idx < orte_local_children->size; idx++) {
1321                 if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, idx))) {
1322                     continue;
1323                 }
1324                 if (OPAL_EQUAL == opal_dss.compare(&job, &(child->name.jobid), ORTE_JOBID) &&
1325                     j == (int)child->app_idx) {
1326                     child->exit_code = rc;
1327                     ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_LAUNCH);
1328                 }
1329             }
1330             goto GETOUT;
1331         }
1332 
1333         
1334 
1335 
1336 
1337 
1338 
1339 
1340         if (0 != chdir(basedir)) {
1341             ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_LAUNCH);
1342             goto GETOUT;
1343         }
1344 
1345         
1346         for (idx=0; idx < orte_local_children->size; idx++) {
1347             if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, idx))) {
1348                 continue;
1349             }
1350             
1351             if (j != (int)child->app_idx) {
1352                 continue;
1353             }
1354 
1355             
1356 
1357 
1358 
1359             if (ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_ALIVE)) {
1360 
1361                 OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1362                                      "%s odls:launch child %s has already been launched",
1363                                      ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1364                                      ORTE_NAME_PRINT(&child->name)));
1365 
1366                 continue;
1367             }
1368             
1369 
1370 
1371             if (ORTE_PROC_STATE_INIT != child->state &&
1372                 ORTE_PROC_STATE_RESTART != child->state) {
1373                 continue;
1374             }
1375             
1376 
1377 
1378 
1379             if (OPAL_EQUAL != opal_dss.compare(&job, &(child->name.jobid), ORTE_JOBID)) {
1380 
1381                 OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1382                                      "%s odls:launch child %s is not in job %s being launched",
1383                                      ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1384                                      ORTE_NAME_PRINT(&child->name),
1385                                      ORTE_JOBID_PRINT(job)));
1386 
1387                 continue;
1388             }
1389 
1390             OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1391                                  "%s odls:launch working child %s",
1392                                  ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1393                                  ORTE_NAME_PRINT(&child->name)));
1394 
1395             
1396             ++orte_odls_globals.next_base;
1397             if (orte_odls_globals.num_threads <= orte_odls_globals.next_base) {
1398                 orte_odls_globals.next_base = 0;
1399             }
1400             evb = orte_odls_globals.ev_bases[orte_odls_globals.next_base];
1401 
1402             
1403 
1404             ORTE_FLAG_SET(child, ORTE_PROC_FLAG_ALIVE);
1405             orte_wait_cb(child, orte_odls_base_default_wait_local_proc, evb, NULL);
1406 
1407             
1408             cd = OBJ_NEW(orte_odls_spawn_caddy_t);
1409             if (NULL != effective_dir) {
1410                 cd->wdir = strdup(effective_dir);
1411             }
1412             cd->jdata = jobdat;
1413             cd->app = app;
1414             cd->child = child;
1415             cd->fork_local = fork_local;
1416             cd->index_argv = index_argv;
1417             
1418             cd->opts.usepty = OPAL_ENABLE_PTY_SUPPORT;
1419 
1420             
1421             if (jobdat->stdin_target == ORTE_VPID_WILDCARD ||
1422                  child->name.vpid == jobdat->stdin_target) {
1423                 cd->opts.connect_stdin = true;
1424             } else {
1425                 cd->opts.connect_stdin = false;
1426             }
1427             if (ORTE_SUCCESS != (rc = orte_iof_base_setup_prefork(&cd->opts))) {
1428                 ORTE_ERROR_LOG(rc);
1429                 child->exit_code = rc;
1430                 OBJ_RELEASE(cd);
1431                 ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_LAUNCH);
1432                 goto GETOUT;
1433             }
1434             if (ORTE_FLAG_TEST(jobdat, ORTE_JOB_FLAG_FORWARD_OUTPUT)) {
1435                 
1436                 rc = orte_iof_base_setup_parent(&child->name, &cd->opts);
1437                 if (ORTE_SUCCESS != rc) {
1438                     ORTE_ERROR_LOG(rc);
1439                     OBJ_RELEASE(cd);
1440                     ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_LAUNCH);
1441                     goto GETOUT;
1442                 }
1443             }
1444             opal_output_verbose(1, orte_odls_base_framework.framework_output,
1445                                 "%s odls:dispatch %s to thread %d",
1446                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1447                                 ORTE_NAME_PRINT(&child->name),
1448                                 orte_odls_globals.next_base);
1449             opal_event_set(evb, &cd->ev, -1,
1450                            OPAL_EV_WRITE, orte_odls_base_spawn_proc, cd);
1451             opal_event_set_priority(&cd->ev, ORTE_MSG_PRI);
1452             opal_event_active(&cd->ev, OPAL_EV_WRITE, 1);
1453 
1454         }
1455         if (NULL != effective_dir) {
1456             free(effective_dir);
1457             effective_dir = NULL;
1458         }
1459     }
1460 
1461   GETOUT:
1462     if (NULL != effective_dir) {
1463         free(effective_dir);
1464         effective_dir = NULL;
1465     }
1466 
1467   ERROR_OUT:
1468     
1469     if (0 != chdir(basedir)) {
1470         ORTE_ERROR_LOG(ORTE_ERROR);
1471     }
1472     
1473     OBJ_RELEASE(caddy);
1474 }
1475 
1476 
1477 
1478 
1479 
1480 int orte_odls_base_default_signal_local_procs(const orte_process_name_t *proc, int32_t signal,
1481                                               orte_odls_base_signal_local_fn_t signal_local)
1482 {
1483     int rc, i;
1484     orte_proc_t *child;
1485 
1486     OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1487                          "%s odls: signaling proc %s",
1488                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1489                          (NULL == proc) ? "NULL" : ORTE_NAME_PRINT(proc)));
1490 
1491     
1492 
1493 
1494     if (NULL == proc) {
1495         rc = ORTE_SUCCESS;  
1496         for (i=0; i < orte_local_children->size; i++) {
1497             if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
1498                 continue;
1499             }
1500             if (0 == child->pid || !ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_ALIVE)) {
1501                 
1502                 continue;
1503             }
1504             if (ORTE_SUCCESS != (rc = signal_local(child->pid, (int)signal))) {
1505                 ORTE_ERROR_LOG(rc);
1506             }
1507         }
1508         return rc;
1509     }
1510 
1511     
1512     for (i=0; i < orte_local_children->size; i++) {
1513         if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
1514             continue;
1515         }
1516         if (OPAL_EQUAL == opal_dss.compare(&(child->name), (orte_process_name_t*)proc, ORTE_NAME)) {
1517             if (ORTE_SUCCESS != (rc = signal_local(child->pid, (int)signal))) {
1518                 ORTE_ERROR_LOG(rc);
1519             }
1520             return rc;
1521         }
1522     }
1523 
1524     
1525 
1526 
1527     ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
1528     return ORTE_ERR_NOT_FOUND;
1529 }
1530 
1531 
1532 
1533 
1534 
1535 void orte_odls_base_default_wait_local_proc(int fd, short sd, void *cbdata)
1536 {
1537     orte_wait_tracker_t *t2 = (orte_wait_tracker_t*)cbdata;
1538     orte_proc_t *proc = t2->child;
1539     int i;
1540     orte_job_t *jobdat;
1541     orte_proc_state_t state=ORTE_PROC_STATE_WAITPID_FIRED;
1542     orte_proc_t *cptr;
1543 
1544     opal_output_verbose(5, orte_odls_base_framework.framework_output,
1545                         "%s odls:wait_local_proc child process %s pid %ld terminated",
1546                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1547                         ORTE_NAME_PRINT(&proc->name), (long)proc->pid);
1548 
1549     
1550 
1551 
1552 
1553 
1554     if (!ORTE_FLAG_TEST(proc, ORTE_PROC_FLAG_ALIVE)) {
1555         OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1556                              "%s odls:waitpid_fired child %s was already dead exit code %d",
1557                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1558                              ORTE_NAME_PRINT(&proc->name),proc->exit_code));
1559         if (WIFEXITED(proc->exit_code)) {
1560             proc->exit_code = WEXITSTATUS(proc->exit_code);
1561             if (0 != proc->exit_code) {
1562                 state = ORTE_PROC_STATE_TERM_NON_ZERO;
1563             }
1564         } else {
1565             if (WIFSIGNALED(proc->exit_code)) {
1566                 state = ORTE_PROC_STATE_ABORTED_BY_SIG;
1567                 proc->exit_code = WTERMSIG(proc->exit_code) + 128;
1568             }
1569         }
1570         goto MOVEON;
1571     }
1572 
1573     
1574 
1575     if (ORTE_FLAG_TEST(proc, ORTE_PROC_FLAG_ABORT)) {
1576         
1577 
1578 
1579         OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1580                              "%s odls:waitpid_fired child %s died by call to abort",
1581                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1582                              ORTE_NAME_PRINT(&proc->name)));
1583         state = ORTE_PROC_STATE_CALLED_ABORT;
1584         
1585 
1586         ORTE_FLAG_SET(proc, ORTE_PROC_FLAG_WAITPID);
1587         goto MOVEON;
1588     }
1589 
1590     
1591     if (NULL == (jobdat = orte_get_job_data_object(proc->name.jobid))) {
1592         ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
1593         goto MOVEON;
1594     }
1595 
1596     
1597 
1598 
1599     if (ORTE_FLAG_TEST(jobdat, ORTE_JOB_FLAG_DEBUGGER_DAEMON))  {
1600         goto MOVEON;
1601     }
1602 
1603     
1604 
1605 
1606     if (ORTE_PROC_STATE_KILLED_BY_CMD == proc->state) {
1607         OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1608                              "%s odls:waitpid_fired child %s was ordered to die",
1609                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1610                              ORTE_NAME_PRINT(&proc->name)));
1611         
1612 
1613         ORTE_FLAG_SET(proc, ORTE_PROC_FLAG_WAITPID);
1614         goto MOVEON;
1615     }
1616 
1617     
1618     if (WIFEXITED(proc->exit_code)) {
1619 
1620         
1621         proc->exit_code = WEXITSTATUS(proc->exit_code);
1622 
1623         OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1624                              "%s odls:waitpid_fired child %s exit code %d",
1625                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1626                              ORTE_NAME_PRINT(&proc->name), proc->exit_code));
1627 
1628         
1629         state = ORTE_PROC_STATE_WAITPID_FIRED;
1630 
1631         
1632         if (ORTE_FLAG_TEST(proc, ORTE_PROC_FLAG_REG)) {
1633             if (ORTE_FLAG_TEST(proc, ORTE_PROC_FLAG_HAS_DEREG) ||
1634                 orte_allowed_exit_without_sync || 0 != proc->exit_code) {
1635                 
1636 
1637 
1638 
1639 
1640 
1641                 if (0 != proc->exit_code && orte_abort_non_zero_exit) {
1642                     state = ORTE_PROC_STATE_TERM_NON_ZERO;
1643                     OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1644                                          "%s odls:waitpid_fired child process %s terminated normally "
1645                                          "but with a non-zero exit status - it "
1646                                          "will be treated as an abnormal termination",
1647                                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1648                                          ORTE_NAME_PRINT(&proc->name)));
1649                 } else {
1650                     
1651                     state = ORTE_PROC_STATE_WAITPID_FIRED;
1652                 }
1653             } else {
1654                 
1655 
1656 
1657                 state = ORTE_PROC_STATE_TERM_WO_SYNC;
1658                 OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1659                                      "%s odls:waitpid_fired child process %s terminated normally "
1660                                      "but did not provide a required finalize sync - it "
1661                                      "will be treated as an abnormal termination",
1662                                      ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1663                                      ORTE_NAME_PRINT(&proc->name)));
1664             }
1665         } else {
1666             
1667             for (i=0; i < orte_local_children->size; i++) {
1668                 if (NULL == (cptr = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
1669                     continue;
1670                 }
1671                 if (cptr->name.jobid != proc->name.jobid) {
1672                     continue;
1673                 }
1674                 if (ORTE_FLAG_TEST(cptr, ORTE_PROC_FLAG_REG) && !orte_allowed_exit_without_sync) {
1675                     
1676 
1677 
1678 
1679                     if (0 != proc->exit_code) {
1680                         state = ORTE_PROC_STATE_TERM_NON_ZERO;
1681                         OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1682                                              "%s odls:waitpid_fired child process %s terminated normally "
1683                                              "but with a non-zero exit status - it "
1684                                              "will be treated as an abnormal termination",
1685                                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1686                                              ORTE_NAME_PRINT(&proc->name)));
1687                     } else {
1688                         state = ORTE_PROC_STATE_TERM_WO_SYNC;
1689                         OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1690                                              "%s odls:waitpid_fired child process %s terminated normally "
1691                                              "but did not provide a required init sync - it "
1692                                              "will be treated as an abnormal termination",
1693                                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1694                                              ORTE_NAME_PRINT(&proc->name)));
1695                     }
1696                     goto MOVEON;
1697                 }
1698             }
1699             
1700 
1701 
1702 
1703             if (0 != proc->exit_code && orte_abort_non_zero_exit) {
1704                 state = ORTE_PROC_STATE_TERM_NON_ZERO;
1705             } else {
1706                 state = ORTE_PROC_STATE_WAITPID_FIRED;
1707             }
1708         }
1709 
1710         OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1711                              "%s odls:waitpid_fired child process %s terminated %s",
1712                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1713                              ORTE_NAME_PRINT(&proc->name),
1714                              (0 == proc->exit_code) ? "normally" : "with non-zero status"));
1715     } else {
1716         
1717 
1718 
1719         state = ORTE_PROC_STATE_ABORTED_BY_SIG;
1720         
1721 
1722 
1723 
1724 
1725 
1726 
1727 
1728 
1729         proc->exit_code = WTERMSIG(proc->exit_code) + 128;
1730 
1731         OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1732                              "%s odls:waitpid_fired child process %s terminated with signal",
1733                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1734                              ORTE_NAME_PRINT(&proc->name) ));
1735         
1736     }
1737 
1738  MOVEON:
1739     
1740     orte_wait_cb_cancel(proc);
1741     ORTE_ACTIVATE_PROC_STATE(&proc->name, state);
1742     
1743     OBJ_RELEASE(t2);
1744 }
1745 
1746 typedef struct {
1747     opal_list_item_t super;
1748     orte_proc_t *child;
1749 } orte_odls_quick_caddy_t;
1750 static void qcdcon(orte_odls_quick_caddy_t *p)
1751 {
1752     p->child = NULL;
1753 }
1754 static void qcddes(orte_odls_quick_caddy_t *p)
1755 {
1756     if (NULL != p->child) {
1757         OBJ_RELEASE(p->child);
1758     }
1759 }
1760 OBJ_CLASS_INSTANCE(orte_odls_quick_caddy_t,
1761                    opal_list_item_t,
1762                    qcdcon, qcddes);
1763 
1764 int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs,
1765                                             orte_odls_base_kill_local_fn_t kill_local)
1766 {
1767     orte_proc_t *child;
1768     opal_list_t procs_killed;
1769     orte_proc_t *proc, proctmp;
1770     int i, j;
1771     opal_pointer_array_t procarray, *procptr;
1772     bool do_cleanup;
1773     orte_odls_quick_caddy_t *cd;
1774 
1775     OBJ_CONSTRUCT(&procs_killed, opal_list_t);
1776 
1777     
1778     if (NULL == procs) {
1779         OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1780                              "%s odls:kill_local_proc working on WILDCARD",
1781                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
1782         OBJ_CONSTRUCT(&procarray, opal_pointer_array_t);
1783         opal_pointer_array_init(&procarray, 1, 1, 1);
1784         OBJ_CONSTRUCT(&proctmp, orte_proc_t);
1785         proctmp.name.jobid = ORTE_JOBID_WILDCARD;
1786         proctmp.name.vpid = ORTE_VPID_WILDCARD;
1787         opal_pointer_array_add(&procarray, &proctmp);
1788         procptr = &procarray;
1789         do_cleanup = true;
1790     } else {
1791         OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1792                              "%s odls:kill_local_proc working on provided array",
1793                              ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
1794         procptr = procs;
1795         do_cleanup = false;
1796     }
1797 
1798     
1799     for (i=0; i < procptr->size; i++) {
1800         if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(procptr, i))) {
1801             continue;
1802         }
1803         for (j=0; j < orte_local_children->size; j++) {
1804             if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, j))) {
1805                 continue;
1806             }
1807 
1808             OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1809                                  "%s odls:kill_local_proc checking child process %s",
1810                                  ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1811                                  ORTE_NAME_PRINT(&child->name)));
1812 
1813             
1814 
1815 
1816 
1817             if (ORTE_JOBID_WILDCARD != proc->name.jobid &&
1818                 proc->name.jobid != child->name.jobid) {
1819 
1820                 OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1821                                      "%s odls:kill_local_proc child %s is not part of job %s",
1822                                      ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1823                                      ORTE_NAME_PRINT(&child->name),
1824                                      ORTE_JOBID_PRINT(proc->name.jobid)));
1825                 continue;
1826             }
1827 
1828             
1829 
1830 
1831             if (ORTE_VPID_WILDCARD != proc->name.vpid &&
1832                 proc->name.vpid != child->name.vpid) {
1833 
1834                 OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1835                                      "%s odls:kill_local_proc child %s is not covered by rank %s",
1836                                      ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1837                                      ORTE_NAME_PRINT(&child->name),
1838                                      ORTE_VPID_PRINT(proc->name.vpid)));
1839                 continue;
1840             }
1841 
1842             
1843 
1844 
1845             if (!ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_ALIVE) || 0 == child->pid) {
1846 
1847                 OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1848                                      "%s odls:kill_local_proc child %s is not alive",
1849                                      ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1850                                      ORTE_NAME_PRINT(&child->name)));
1851 
1852                 
1853 
1854 
1855                 if (ORTE_PROC_STATE_UNDEF == child->state ||
1856                     ORTE_PROC_STATE_INIT == child->state ||
1857                     ORTE_PROC_STATE_RUNNING == child->state) {
1858                     
1859 
1860 
1861                     child->state = ORTE_PROC_STATE_TERMINATED;
1862                     
1863 
1864 
1865                     ORTE_FLAG_SET(child, ORTE_PROC_FLAG_WAITPID);
1866                     child->pid = 0;
1867                     goto CLEANUP;
1868                 } else {
1869                     continue;
1870                 }
1871             }
1872 
1873             
1874 
1875 
1876             if (NULL != orte_iof.close) {
1877                 orte_iof.close(&child->name, ORTE_IOF_STDIN);
1878             }
1879 
1880             
1881 
1882 
1883             orte_wait_cb_cancel(child);
1884 
1885             
1886 
1887 
1888 
1889             OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1890                                  "%s SENDING SIGCONT TO %s",
1891                                  ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1892                                  ORTE_NAME_PRINT(&child->name)));
1893             cd = OBJ_NEW(orte_odls_quick_caddy_t);
1894             OBJ_RETAIN(child);
1895             cd->child = child;
1896             opal_list_append(&procs_killed, &cd->super);
1897             kill_local(child->pid, SIGCONT);
1898             continue;
1899 
1900         CLEANUP:
1901             
1902             orte_session_dir_finalize(&child->name);
1903             
1904 
1905 
1906             if (ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_IOF_COMPLETE) &&
1907                 ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_WAITPID)) {
1908                 ORTE_ACTIVATE_PROC_STATE(&child->name, child->state);
1909             }
1910         }
1911     }
1912 
1913     
1914 
1915     if (0 < opal_list_get_size(&procs_killed)) {
1916         sleep(orte_odls_globals.timeout_before_sigkill);
1917         
1918         OPAL_LIST_FOREACH(cd, &procs_killed, orte_odls_quick_caddy_t) {
1919             OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1920                                  "%s SENDING SIGTERM TO %s",
1921                                  ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1922                                  ORTE_NAME_PRINT(&cd->child->name)));
1923             kill_local(cd->child->pid, SIGTERM);
1924         }
1925         
1926         sleep(orte_odls_globals.timeout_before_sigkill);
1927         
1928         OPAL_LIST_FOREACH(cd, &procs_killed, orte_odls_quick_caddy_t) {
1929             OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1930                                  "%s SENDING SIGKILL TO %s",
1931                                  ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1932                                  ORTE_NAME_PRINT(&cd->child->name)));
1933             kill_local(cd->child->pid, SIGKILL);
1934             
1935 
1936 
1937             ORTE_FLAG_SET(cd->child, ORTE_PROC_FLAG_WAITPID);
1938 
1939             
1940 
1941 
1942 
1943             ORTE_FLAG_UNSET(cd->child, ORTE_PROC_FLAG_ALIVE);
1944             cd->child->pid = 0;
1945 
1946             
1947             cd->child->state = ORTE_PROC_STATE_KILLED_BY_CMD;  
1948 
1949             
1950             orte_session_dir_finalize(&cd->child->name);
1951             
1952 
1953 
1954             if (ORTE_FLAG_TEST(cd->child, ORTE_PROC_FLAG_IOF_COMPLETE) &&
1955                 ORTE_FLAG_TEST(cd->child, ORTE_PROC_FLAG_WAITPID)) {
1956                 ORTE_ACTIVATE_PROC_STATE(&cd->child->name, cd->child->state);
1957             }
1958         }
1959     }
1960     OPAL_LIST_DESTRUCT(&procs_killed);
1961 
1962     
1963     if (do_cleanup) {
1964         OBJ_DESTRUCT(&procarray);
1965         OBJ_DESTRUCT(&proctmp);
1966     }
1967 
1968     return ORTE_SUCCESS;
1969 }
1970 
1971 int orte_odls_base_get_proc_stats(opal_buffer_t *answer,
1972                                   orte_process_name_t *proc)
1973 {
1974     int rc;
1975     orte_proc_t *child;
1976     opal_pstats_t stats, *statsptr;
1977     int i, j;
1978 
1979     OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1980                          "%s odls:get_proc_stats for proc %s",
1981                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1982                          ORTE_NAME_PRINT(proc)));
1983 
1984     
1985     for (i=0; i < orte_local_children->size; i++) {
1986         if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
1987             continue;
1988         }
1989 
1990         if (proc->jobid == child->name.jobid &&
1991             (proc->vpid == child->name.vpid ||
1992              ORTE_VPID_WILDCARD == proc->vpid)) { 
1993 
1994             OBJ_CONSTRUCT(&stats, opal_pstats_t);
1995             
1996             for (j=0; j < (int)strlen(orte_process_info.nodename) &&
1997                  j < OPAL_PSTAT_MAX_STRING_LEN-1 &&
1998                  orte_process_info.nodename[j] != '.'; j++) {
1999                 stats.node[j] = orte_process_info.nodename[j];
2000             }
2001             
2002             stats.rank = child->name.vpid;
2003             
2004             rc = opal_pstat.query(child->pid, &stats, NULL);
2005             if (ORTE_SUCCESS != rc) {
2006                 OBJ_DESTRUCT(&stats);
2007                 return rc;
2008             }
2009             if (ORTE_SUCCESS != (rc = opal_dss.pack(answer, proc, 1, ORTE_NAME))) {
2010                 ORTE_ERROR_LOG(rc);
2011                 OBJ_DESTRUCT(&stats);
2012                 return rc;
2013             }
2014             statsptr = &stats;
2015             if (ORTE_SUCCESS != (rc = opal_dss.pack(answer, &statsptr, 1, OPAL_PSTAT))) {
2016                 ORTE_ERROR_LOG(rc);
2017                 OBJ_DESTRUCT(&stats);
2018                 return rc;
2019             }
2020             OBJ_DESTRUCT(&stats);
2021         }
2022     }
2023 
2024     return ORTE_SUCCESS;
2025 }
2026 
2027 int orte_odls_base_default_restart_proc(orte_proc_t *child,
2028                                         orte_odls_base_fork_local_proc_fn_t fork_local)
2029 {
2030     int rc;
2031     orte_app_context_t *app;
2032     orte_job_t *jobdat;
2033     char basedir[MAXPATHLEN];
2034     char *wdir = NULL;
2035     orte_odls_spawn_caddy_t *cd;
2036     opal_event_base_t *evb;
2037 
2038     OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
2039                          "%s odls:restart_proc for proc %s",
2040                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
2041                          ORTE_NAME_PRINT(&child->name)));
2042 
2043     
2044 
2045 
2046 
2047     if (NULL == getcwd(basedir, sizeof(basedir))) {
2048         return ORTE_ERR_OUT_OF_RESOURCE;
2049     }
2050 
2051     
2052     if (NULL == (jobdat = orte_get_job_data_object(child->name.jobid))) {
2053         
2054         ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
2055         return ORTE_ERR_NOT_FOUND;
2056     }
2057 
2058     child->state = ORTE_PROC_STATE_FAILED_TO_START;
2059     child->exit_code = 0;
2060     ORTE_FLAG_UNSET(child, ORTE_PROC_FLAG_WAITPID);
2061     ORTE_FLAG_UNSET(child, ORTE_PROC_FLAG_IOF_COMPLETE);
2062     child->pid = 0;
2063     if (NULL != child->rml_uri) {
2064         free(child->rml_uri);
2065         child->rml_uri = NULL;
2066     }
2067     app = (orte_app_context_t*)opal_pointer_array_get_item(jobdat->apps, child->app_idx);
2068 
2069     
2070     if (ORTE_SUCCESS != (rc = orte_schizo.setup_child(jobdat, child, app, &app->env))) {
2071         ORTE_ERROR_LOG(rc);
2072         goto CLEANUP;
2073     }
2074 
2075     
2076     if (ORTE_SUCCESS != (rc = setup_path(app, &wdir))) {
2077         ORTE_ERROR_LOG(rc);
2078         if (NULL != wdir) {
2079             free(wdir);
2080         }
2081         goto CLEANUP;
2082     }
2083 
2084     
2085     cd = OBJ_NEW(orte_odls_spawn_caddy_t);
2086     if (NULL != wdir) {
2087         cd->wdir = strdup(wdir);
2088         free(wdir);
2089     }
2090     cd->jdata = jobdat;
2091     cd->app = app;
2092     cd->child = child;
2093     cd->fork_local = fork_local;
2094     
2095     cd->opts.usepty = OPAL_ENABLE_PTY_SUPPORT;
2096 
2097     
2098     if (jobdat->stdin_target == ORTE_VPID_WILDCARD ||
2099          child->name.vpid == jobdat->stdin_target) {
2100         cd->opts.connect_stdin = true;
2101     } else {
2102         cd->opts.connect_stdin = false;
2103     }
2104     if (ORTE_SUCCESS != (rc = orte_iof_base_setup_prefork(&cd->opts))) {
2105         ORTE_ERROR_LOG(rc);
2106         child->exit_code = rc;
2107         OBJ_RELEASE(cd);
2108         ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_LAUNCH);
2109         goto CLEANUP;
2110     }
2111     if (ORTE_FLAG_TEST(jobdat, ORTE_JOB_FLAG_FORWARD_OUTPUT)) {
2112         
2113         rc = orte_iof_base_setup_parent(&child->name, &cd->opts);
2114         if (ORTE_SUCCESS != rc) {
2115             ORTE_ERROR_LOG(rc);
2116             OBJ_RELEASE(cd);
2117             ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_LAUNCH);
2118             goto CLEANUP;
2119         }
2120     }
2121     ++orte_odls_globals.next_base;
2122     if (orte_odls_globals.num_threads <= orte_odls_globals.next_base) {
2123         orte_odls_globals.next_base = 0;
2124     }
2125     evb = orte_odls_globals.ev_bases[orte_odls_globals.next_base];
2126     orte_wait_cb(child, orte_odls_base_default_wait_local_proc, evb, NULL);
2127 
2128     OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
2129                          "%s restarting app %s",
2130                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), app->app));
2131 
2132     opal_event_set(evb, &cd->ev, -1,
2133                    OPAL_EV_WRITE, orte_odls_base_spawn_proc, cd);
2134     opal_event_set_priority(&cd->ev, ORTE_MSG_PRI);
2135     opal_event_active(&cd->ev, OPAL_EV_WRITE, 1);
2136 
2137   CLEANUP:
2138     OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
2139                          "%s odls:restart of proc %s %s",
2140                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
2141                          ORTE_NAME_PRINT(&child->name),
2142                          (ORTE_SUCCESS == rc) ? "succeeded" : "failed"));
2143 
2144     
2145 
2146 
2147 
2148 
2149 
2150 
2151     if (0 != chdir(basedir)) {
2152         ORTE_ERROR_LOG(ORTE_ERROR);
2153     }
2154 
2155     return rc;
2156 }