This source file includes the following definitions.
- setup_cbfunc
- orte_odls_base_default_get_add_procs_data
- ls_cbunc
- orte_odls_base_default_construct_child_list
- setup_path
- timer_cb
- compute_num_procs_alive
- orte_odls_base_spawn_proc
- orte_odls_base_default_launch_local
- orte_odls_base_default_signal_local_procs
- orte_odls_base_default_wait_local_proc
- qcdcon
- qcddes
- orte_odls_base_default_kill_local_procs
- orte_odls_base_get_proc_stats
- orte_odls_base_default_restart_proc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30 #include "orte_config.h"
31 #include "orte/constants.h"
32 #include "orte/types.h"
33
34 #ifdef HAVE_SYS_WAIT_H
35 #include <sys/wait.h>
36 #endif
37 #include <errno.h>
38 #ifdef HAVE_SYS_STAT_H
39 #include <sys/stat.h>
40 #endif
41 #ifdef HAVE_SYS_PARAM_H
42 #include <sys/param.h>
43 #endif
44 #include <time.h>
45
46 #include <signal.h>
47
48 #include "opal_stdint.h"
49 #include "opal/util/opal_environ.h"
50 #include "opal/util/argv.h"
51 #include "opal/util/os_dirpath.h"
52 #include "opal/util/os_path.h"
53 #include "opal/util/path.h"
54 #include "opal/util/printf.h"
55 #include "opal/util/sys_limits.h"
56 #include "opal/dss/dss.h"
57 #include "opal/mca/hwloc/hwloc-internal.h"
58 #include "opal/mca/shmem/base/base.h"
59 #include "opal/mca/pstat/pstat.h"
60 #include "opal/mca/pmix/base/base.h"
61
62 #include "orte/mca/errmgr/errmgr.h"
63 #include "orte/mca/rml/rml.h"
64 #include "orte/mca/routed/routed.h"
65 #include "orte/mca/iof/iof.h"
66 #include "orte/mca/iof/base/iof_base_setup.h"
67 #include "orte/mca/ess/base/base.h"
68 #include "orte/mca/grpcomm/base/base.h"
69 #include "orte/mca/plm/base/base.h"
70 #include "orte/mca/rml/base/rml_contact.h"
71 #include "orte/mca/rmaps/rmaps_types.h"
72 #include "orte/mca/rmaps/base/base.h"
73 #include "orte/mca/rmaps/base/rmaps_private.h"
74 #include "orte/mca/rtc/rtc.h"
75 #include "orte/mca/schizo/schizo.h"
76 #include "orte/mca/state/state.h"
77 #include "orte/mca/filem/filem.h"
78
79 #include "orte/util/context_fns.h"
80 #include "orte/util/name_fns.h"
81 #include "orte/util/nidmap.h"
82 #include "orte/util/session_dir.h"
83 #include "orte/util/proc_info.h"
84 #include "orte/util/show_help.h"
85 #include "orte/util/threads.h"
86 #include "orte/runtime/orte_globals.h"
87 #include "orte/runtime/orte_wait.h"
88 #include "orte/orted/orted.h"
89 #include "orte/orted/pmix/pmix_server.h"
90
91 #if OPAL_ENABLE_FT_CR == 1
92 #include "orte/mca/snapc/snapc.h"
93 #include "orte/mca/snapc/base/base.h"
94 #include "orte/mca/sstore/sstore.h"
95 #include "orte/mca/sstore/base/base.h"
96 #include "opal/mca/crs/crs.h"
97 #include "opal/mca/crs/base/base.h"
98 #endif
99
100 #include "orte/mca/odls/base/base.h"
101 #include "orte/mca/odls/base/odls_private.h"
102
103 static void setup_cbfunc(int status,
104 opal_list_t *info,
105 void *provided_cbdata,
106 opal_pmix_op_cbfunc_t cbfunc, void *cbdata)
107 {
108 orte_job_t *jdata = (orte_job_t*)provided_cbdata;
109 opal_value_t *kv;
110 opal_buffer_t cache, *bptr;
111 int rc = ORTE_SUCCESS;
112
113 OBJ_CONSTRUCT(&cache, opal_buffer_t);
114 if (NULL != info) {
115
116 OPAL_LIST_FOREACH(kv, info, opal_value_t) {
117 if (OPAL_SUCCESS != (rc = opal_dss.pack(&cache, &kv, 1, OPAL_VALUE))) {
118 ORTE_ERROR_LOG(rc);
119 }
120 }
121 }
122
123 bptr = &cache;
124 opal_dss.pack(&jdata->launch_msg, &bptr, 1, OPAL_BUFFER);
125 OBJ_DESTRUCT(&cache);
126
127
128 if (NULL != cbfunc) {
129 cbfunc(rc, cbdata);
130 }
131
132
133 ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_SEND_LAUNCH_MSG);
134
135 }
136
137
138
139 int orte_odls_base_default_get_add_procs_data(opal_buffer_t *buffer,
140 orte_jobid_t job)
141 {
142 int rc, v;
143 orte_job_t *jdata=NULL, *jptr;
144 orte_job_map_t *map=NULL;
145 opal_buffer_t *wireup, jobdata, priorjob;
146 opal_byte_object_t bo, *boptr;
147 int32_t numbytes;
148 int8_t flag;
149 void *nptr;
150 uint32_t key;
151 orte_proc_t *dmn, *proc;
152 opal_value_t *val = NULL, *kv;
153 opal_list_t *modex, ilist;
154 int n;
155
156
157 if (NULL == (jdata = orte_get_job_data_object(job))) {
158 ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
159 return ORTE_ERR_BAD_PARAM;
160 }
161
162
163 map = jdata->map;
164
165 if (NULL == map) {
166 return ORTE_SUCCESS;
167 }
168
169
170
171
172 if (1 < orte_process_info.num_procs &&
173 (!orte_node_info_communicated ||
174 orte_get_attribute(&jdata->attributes, ORTE_JOB_LAUNCHED_DAEMONS, NULL, OPAL_BOOL))) {
175
176 flag = 1;
177 opal_dss.pack(buffer, &flag, 1, OPAL_INT8);
178
179 if (ORTE_SUCCESS != (rc = orte_util_nidmap_create(orte_node_pool, buffer))) {
180 ORTE_ERROR_LOG(rc);
181 return rc;
182 }
183
184
185 if (NULL == (jptr = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid))) {
186 ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
187 return ORTE_ERR_BAD_PARAM;
188 }
189 wireup = OBJ_NEW(opal_buffer_t);
190
191 val = NULL;
192 if (opal_pmix.legacy_get()) {
193 if (OPAL_SUCCESS != (rc = opal_pmix.get(ORTE_PROC_MY_NAME, OPAL_PMIX_PROC_URI, NULL, &val)) || NULL == val) {
194 ORTE_ERROR_LOG(rc);
195 OBJ_RELEASE(wireup);
196 return rc;
197 } else {
198
199 if (ORTE_SUCCESS != (rc = opal_dss.pack(wireup, ORTE_PROC_MY_NAME, 1, ORTE_NAME))) {
200 ORTE_ERROR_LOG(rc);
201 OBJ_RELEASE(wireup);
202 return rc;
203 }
204
205 if (ORTE_SUCCESS != (rc = opal_dss.pack(wireup, &val->data.string, 1, OPAL_STRING))) {
206 ORTE_ERROR_LOG(rc);
207 OBJ_RELEASE(wireup);
208 return rc;
209 }
210 OBJ_RELEASE(val);
211 }
212 } else {
213 if (OPAL_SUCCESS != (rc = opal_pmix.get(ORTE_PROC_MY_NAME, NULL, NULL, &val)) || NULL == val) {
214 ORTE_ERROR_LOG(rc);
215 OBJ_RELEASE(wireup);
216 return rc;
217 }
218
219 if (OPAL_PTR != val->type) {
220 ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
221 OBJ_RELEASE(wireup);
222 return ORTE_ERR_NOT_FOUND;
223 }
224 if (ORTE_SUCCESS != (rc = opal_dss.pack(wireup, ORTE_PROC_MY_NAME, 1, ORTE_NAME))) {
225 ORTE_ERROR_LOG(rc);
226 OBJ_RELEASE(wireup);
227 return rc;
228 }
229 modex = (opal_list_t*)val->data.ptr;
230 numbytes = (int32_t)opal_list_get_size(modex);
231 if (ORTE_SUCCESS != (rc = opal_dss.pack(wireup, &numbytes, 1, OPAL_INT32))) {
232 ORTE_ERROR_LOG(rc);
233 OBJ_RELEASE(wireup);
234 return rc;
235 }
236 OPAL_LIST_FOREACH(kv, modex, opal_value_t) {
237 if (ORTE_SUCCESS != (rc = opal_dss.pack(wireup, &kv, 1, OPAL_VALUE))) {
238 ORTE_ERROR_LOG(rc);
239 OBJ_RELEASE(wireup);
240 return rc;
241 }
242 }
243 OPAL_LIST_RELEASE(modex);
244 OBJ_RELEASE(val);
245 }
246
247 for (v=1; v < jptr->procs->size; v++) {
248 if (NULL == (dmn = (orte_proc_t*)opal_pointer_array_get_item(jptr->procs, v))) {
249 continue;
250 }
251 val = NULL;
252 if (opal_pmix.legacy_get()) {
253 if (OPAL_SUCCESS != (rc = opal_pmix.get(&dmn->name, OPAL_PMIX_PROC_URI, NULL, &val)) || NULL == val) {
254 ORTE_ERROR_LOG(rc);
255 OBJ_RELEASE(buffer);
256 OBJ_RELEASE(wireup);
257 return rc;
258 } else {
259
260 if (ORTE_SUCCESS != (rc = opal_dss.pack(wireup, &dmn->name, 1, ORTE_NAME))) {
261 ORTE_ERROR_LOG(rc);
262 OBJ_RELEASE(buffer);
263 OBJ_RELEASE(wireup);
264 return rc;
265 }
266
267 if (ORTE_SUCCESS != (rc = opal_dss.pack(wireup, &val->data.string, 1, OPAL_STRING))) {
268 ORTE_ERROR_LOG(rc);
269 OBJ_RELEASE(buffer);
270 OBJ_RELEASE(wireup);
271 return rc;
272 }
273 OBJ_RELEASE(val);
274 }
275 } else {
276 if (OPAL_SUCCESS != (rc = opal_pmix.get(&dmn->name, NULL, NULL, &val)) || NULL == val) {
277 ORTE_ERROR_LOG(rc);
278 OBJ_RELEASE(buffer);
279 return rc;
280 } else {
281
282 if (OPAL_PTR != val->type) {
283 ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
284 OBJ_RELEASE(buffer);
285 return ORTE_ERR_NOT_FOUND;
286 }
287 if (ORTE_SUCCESS != (rc = opal_dss.pack(wireup, &dmn->name, 1, ORTE_NAME))) {
288 ORTE_ERROR_LOG(rc);
289 OBJ_RELEASE(buffer);
290 OBJ_RELEASE(wireup);
291 return rc;
292 }
293 modex = (opal_list_t*)val->data.ptr;
294 numbytes = (int32_t)opal_list_get_size(modex);
295 if (ORTE_SUCCESS != (rc = opal_dss.pack(wireup, &numbytes, 1, OPAL_INT32))) {
296 ORTE_ERROR_LOG(rc);
297 OBJ_RELEASE(buffer);
298 OBJ_RELEASE(wireup);
299 return rc;
300 }
301 OPAL_LIST_FOREACH(kv, modex, opal_value_t) {
302 if (ORTE_SUCCESS != (rc = opal_dss.pack(wireup, &kv, 1, OPAL_VALUE))) {
303 ORTE_ERROR_LOG(rc);
304 OBJ_RELEASE(buffer);
305 OBJ_RELEASE(wireup);
306 return rc;
307 }
308 }
309 OPAL_LIST_RELEASE(modex);
310 OBJ_RELEASE(val);
311 }
312 }
313 }
314
315 opal_dss.unload(wireup, (void**)&bo.bytes, &numbytes);
316 OBJ_RELEASE(wireup);
317
318 bo.size = numbytes;
319 boptr = &bo;
320 if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &boptr, 1, OPAL_BYTE_OBJECT))) {
321 ORTE_ERROR_LOG(rc);
322 return rc;
323 }
324
325 if (NULL != bo.bytes) {
326 free(bo.bytes);
327 }
328
329
330
331
332
333 if (orte_get_attribute(&jdata->attributes, ORTE_JOB_LAUNCHED_DAEMONS, NULL, OPAL_BOOL)) {
334 flag = 1;
335 opal_dss.pack(buffer, &flag, 1, OPAL_INT8);
336 OBJ_CONSTRUCT(&jobdata, opal_buffer_t);
337 rc = opal_hash_table_get_first_key_uint32(orte_job_data, &key, (void **)&jptr, &nptr);
338 while (OPAL_SUCCESS == rc) {
339
340 if (NULL != jptr && jptr != jdata &&
341 ORTE_PROC_MY_NAME->jobid != jptr->jobid) {
342 OBJ_CONSTRUCT(&priorjob, opal_buffer_t);
343
344 if (ORTE_SUCCESS != (rc = opal_dss.pack(&priorjob, &jptr, 1, ORTE_JOB))) {
345 ORTE_ERROR_LOG(rc);
346 OBJ_DESTRUCT(&jobdata);
347 OBJ_DESTRUCT(&priorjob);
348 return rc;
349 }
350
351 for (n=0; n < jptr->procs->size; n++) {
352 if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(jptr->procs, n))) {
353 continue;
354 }
355 if (ORTE_SUCCESS != (rc = opal_dss.pack(&priorjob, &proc->parent, 1, ORTE_VPID))) {
356 ORTE_ERROR_LOG(rc);
357 OBJ_DESTRUCT(&jobdata);
358 OBJ_DESTRUCT(&priorjob);
359 return rc;
360 }
361 }
362
363 wireup = &priorjob;
364 if (ORTE_SUCCESS != (rc = opal_dss.pack(&jobdata, &wireup, 1, OPAL_BUFFER))) {
365 ORTE_ERROR_LOG(rc);
366 OBJ_DESTRUCT(&jobdata);
367 OBJ_DESTRUCT(&priorjob);
368 return rc;
369 }
370 OBJ_DESTRUCT(&priorjob);
371 }
372 rc = opal_hash_table_get_next_key_uint32(orte_job_data, &key, (void **)&jptr, nptr, &nptr);
373 }
374
375 wireup = &jobdata;
376 if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &wireup, 1, OPAL_BUFFER))) {
377 ORTE_ERROR_LOG(rc);
378 OBJ_DESTRUCT(&jobdata);
379 return rc;
380 }
381 OBJ_DESTRUCT(&jobdata);
382 } else {
383 flag = 0;
384 opal_dss.pack(buffer, &flag, 1, OPAL_INT8);
385 }
386 orte_node_info_communicated = true;
387 } else {
388
389 flag = 0;
390 opal_dss.pack(buffer, &flag, 1, OPAL_INT8);
391
392 flag = 0;
393 opal_dss.pack(buffer, &flag, 1, OPAL_INT8);
394 }
395
396
397 if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &jdata, 1, ORTE_JOB))) {
398 ORTE_ERROR_LOG(rc);
399 return rc;
400 }
401
402 if (!orte_get_attribute(&jdata->attributes, ORTE_JOB_FULLY_DESCRIBED, NULL, OPAL_BOOL)) {
403
404 if (ORTE_SUCCESS != (rc = orte_util_generate_ppn(jdata, buffer))) {
405 ORTE_ERROR_LOG(rc);
406 return rc;
407 }
408 }
409
410
411 if (NULL != opal_pmix.server_setup_application) {
412 OBJ_CONSTRUCT(&ilist, opal_list_t);
413
414 kv = OBJ_NEW(opal_value_t);
415 kv->key = strdup(OPAL_PMIX_ALLOC_NETWORK_ID);
416 kv->type = OPAL_STRING;
417 opal_asprintf(&kv->data.string, "%s.net", ORTE_JOBID_PRINT(jdata->jobid));
418 opal_list_append(&ilist, &kv->super);
419
420 kv = OBJ_NEW(opal_value_t);
421 kv->key = strdup(OPAL_PMIX_ALLOC_NETWORK_SEC_KEY);
422 kv->type = OPAL_BOOL;
423 kv->data.flag = true;
424 opal_list_append(&ilist, &kv->super);
425
426 kv = OBJ_NEW(opal_value_t);
427 kv->key = strdup(OPAL_PMIX_SETUP_APP_ENVARS);
428 kv->type = OPAL_BOOL;
429 kv->data.flag = true;
430 opal_list_append(&ilist, &kv->super);
431
432
433 if (OPAL_SUCCESS != (rc = opal_pmix.server_setup_application(jdata->jobid, &ilist, setup_cbfunc, jdata))) {
434 ORTE_ERROR_LOG(rc);
435 }
436 OPAL_LIST_DESTRUCT(&ilist);
437 return rc;
438 }
439
440
441 ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_SEND_LAUNCH_MSG);
442
443 return ORTE_SUCCESS;
444 }
445
446 static void ls_cbunc(int status, void *cbdata)
447 {
448 opal_pmix_lock_t *lock = (opal_pmix_lock_t*)cbdata;
449 OPAL_PMIX_WAKEUP_THREAD(lock);
450 }
451
452 int orte_odls_base_default_construct_child_list(opal_buffer_t *buffer,
453 orte_jobid_t *job)
454 {
455 int rc;
456 orte_std_cntr_t cnt;
457 orte_job_t *jdata=NULL, *daemons;
458 orte_node_t *node;
459 orte_vpid_t dmnvpid, v;
460 int32_t n;
461 opal_buffer_t *bptr, *jptr;
462 orte_proc_t *pptr, *dmn;
463 orte_app_context_t *app;
464 int8_t flag;
465 opal_value_t *kv;
466 opal_list_t local_support, cache;
467 opal_pmix_lock_t lock;
468
469 OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
470 "%s odls:constructing child list",
471 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
472
473
474 *job = ORTE_JOBID_INVALID;
475
476 daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
477 OPAL_PMIX_CONSTRUCT_LOCK(&lock);
478 OBJ_CONSTRUCT(&local_support, opal_list_t);
479
480
481 cnt=1;
482 if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &flag, &cnt, OPAL_INT8))) {
483 ORTE_ERROR_LOG(rc);
484 goto REPORT_ERROR;
485 }
486
487 if (0 != flag) {
488
489 cnt=1;
490 if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &bptr, &cnt, OPAL_BUFFER))) {
491 *job = ORTE_JOBID_INVALID;
492 ORTE_ERROR_LOG(rc);
493 OBJ_RELEASE(bptr);
494 goto REPORT_ERROR;
495 }
496 cnt=1;
497 while (ORTE_SUCCESS == (rc = opal_dss.unpack(bptr, &jptr, &cnt, OPAL_BUFFER))) {
498
499 cnt=1;
500 if (ORTE_SUCCESS != (rc = opal_dss.unpack(jptr, &jdata, &cnt, ORTE_JOB))) {
501 *job = ORTE_JOBID_INVALID;
502 ORTE_ERROR_LOG(rc);
503 OBJ_RELEASE(bptr);
504 OBJ_RELEASE(jptr);
505 goto REPORT_ERROR;
506 }
507
508 if (NULL == orte_get_job_data_object(jdata->jobid)) {
509
510 opal_hash_table_set_value_uint32(orte_job_data, jdata->jobid, jdata);
511 } else {
512
513 jdata->jobid = ORTE_JOBID_INVALID;
514 OBJ_RELEASE(jdata);
515 OBJ_RELEASE(jptr);
516 cnt=1;
517 continue;
518 }
519
520 for (v=0; v < jdata->num_procs; v++) {
521 if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, v))) {
522 pptr = OBJ_NEW(orte_proc_t);
523 pptr->name.jobid = jdata->jobid;
524 pptr->name.vpid = v;
525 opal_pointer_array_set_item(jdata->procs, v, pptr);
526 }
527 cnt=1;
528 if (ORTE_SUCCESS != (rc = opal_dss.unpack(jptr, &dmnvpid, &cnt, ORTE_VPID))) {
529 ORTE_ERROR_LOG(rc);
530 OBJ_RELEASE(jptr);
531 OBJ_RELEASE(bptr);
532 goto REPORT_ERROR;
533 }
534
535 if (NULL == (dmn = (orte_proc_t*)opal_pointer_array_get_item(daemons->procs, dmnvpid))) {
536 ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
537 rc = ORTE_ERR_NOT_FOUND;
538 OBJ_RELEASE(jptr);
539 OBJ_RELEASE(bptr);
540 goto REPORT_ERROR;
541 }
542
543 OBJ_RETAIN(dmn->node);
544 pptr->node = dmn->node;
545 }
546
547 OBJ_RELEASE(jptr);
548 cnt = 1;
549 }
550 OBJ_RELEASE(bptr);
551 }
552
553
554 cnt=1;
555 if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &jdata, &cnt, ORTE_JOB))) {
556 *job = ORTE_JOBID_INVALID;
557 ORTE_ERROR_LOG(rc);
558 goto REPORT_ERROR;
559 }
560 if (ORTE_JOBID_INVALID == jdata->jobid) {
561 ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
562 rc = ORTE_ERR_BAD_PARAM;
563 goto REPORT_ERROR;
564 }
565 *job = jdata->jobid;
566
567 OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
568 "%s odls:construct_child_list unpacking data to launch job %s",
569 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(*job)));
570
571
572
573
574
575 if (ORTE_PROC_IS_HNP) {
576
577
578
579
580
581 jdata->jobid = ORTE_JOBID_INVALID;
582 OBJ_RELEASE(jdata);
583
584 if (NULL == (jdata = orte_get_job_data_object(*job))) {
585 ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
586 rc = ORTE_ERR_NOT_FOUND;
587 goto REPORT_ERROR;
588 }
589 } else {
590 opal_hash_table_set_value_uint32(orte_job_data, jdata->jobid, jdata);
591
592
593 if (NULL == jdata->map) {
594 jdata->map = OBJ_NEW(orte_job_map_t);
595 }
596 }
597
598
599
600
601 if (!orte_get_attribute(&jdata->attributes, ORTE_JOB_FULLY_DESCRIBED, NULL, OPAL_BOOL)) {
602
603
604
605 if (ORTE_SUCCESS != (rc = orte_util_decode_ppn(jdata, buffer))) {
606 ORTE_ERROR_LOG(rc);
607 goto REPORT_ERROR;
608 }
609
610 if (!ORTE_PROC_IS_HNP) {
611
612 if (ORTE_SUCCESS != (rc = orte_rmaps_base_assign_locations(jdata))) {
613 ORTE_ERROR_LOG(rc);
614 goto REPORT_ERROR;
615 }
616 }
617
618
619
620 if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata))) {
621 ORTE_ERROR_LOG(rc);
622 goto REPORT_ERROR;
623 }
624
625 if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_local_ranks(jdata))) {
626 ORTE_ERROR_LOG(rc);
627 goto REPORT_ERROR;
628 }
629 }
630
631
632
633 cnt=1;
634 rc = opal_dss.unpack(buffer, &bptr, &cnt, OPAL_BUFFER);
635 if (OPAL_SUCCESS == rc) {
636
637 cnt=1;
638 OBJ_CONSTRUCT(&cache, opal_list_t);
639 while (ORTE_SUCCESS == (rc = opal_dss.unpack(bptr, &kv, &cnt, OPAL_VALUE))) {
640
641
642 if (0 == strcmp(kv->key, OPAL_PMIX_SET_ENVAR) ||
643 0 == strcmp(kv->key, OPAL_PMIX_ADD_ENVAR) ||
644 0 == strcmp(kv->key, OPAL_PMIX_UNSET_ENVAR) ||
645 0 == strcmp(kv->key, OPAL_PMIX_PREPEND_ENVAR) ||
646 0 == strcmp(kv->key, OPAL_PMIX_APPEND_ENVAR)) {
647 opal_output_verbose(5, orte_odls_base_framework.framework_output,
648 "ORTE:ODLS ADDING ENVAR %s", kv->data.envar.envar);
649 opal_list_prepend(&cache, &kv->super);
650 } else {
651
652 opal_list_append(&local_support, &kv->super);
653 }
654 }
655 OBJ_RELEASE(bptr);
656
657 while (NULL != (kv = (opal_value_t*)opal_list_remove_first(&cache))) {
658 if (0 == strcmp(kv->key, OPAL_PMIX_SET_ENVAR)) {
659 orte_prepend_attribute(&jdata->attributes, ORTE_JOB_SET_ENVAR,
660 ORTE_ATTR_GLOBAL, &kv->data.envar, OPAL_ENVAR);
661 } else if (0 == strcmp(kv->key, OPAL_PMIX_ADD_ENVAR)) {
662 orte_prepend_attribute(&jdata->attributes, ORTE_JOB_ADD_ENVAR,
663 ORTE_ATTR_GLOBAL, &kv->data.envar, OPAL_ENVAR);
664 } else if (0 == strcmp(kv->key, OPAL_PMIX_UNSET_ENVAR)) {
665 orte_prepend_attribute(&jdata->attributes, ORTE_JOB_UNSET_ENVAR,
666 ORTE_ATTR_GLOBAL, kv->data.string, OPAL_STRING);
667 } else if (0 == strcmp(kv->key, OPAL_PMIX_PREPEND_ENVAR)) {
668 orte_prepend_attribute(&jdata->attributes, ORTE_JOB_PREPEND_ENVAR,
669 ORTE_ATTR_GLOBAL, &kv->data.envar, OPAL_ENVAR);
670 } else if (0 == strcmp(kv->key, OPAL_PMIX_APPEND_ENVAR)) {
671 orte_prepend_attribute(&jdata->attributes, ORTE_JOB_APPEND_ENVAR,
672 ORTE_ATTR_GLOBAL, &kv->data.envar, OPAL_ENVAR);
673 }
674 OBJ_RELEASE(kv);
675 }
676 OPAL_LIST_DESTRUCT(&cache);
677 }
678
679
680
681
682 for (n=0; n < jdata->procs->size; n++) {
683 if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, n))) {
684 continue;
685 }
686 if (ORTE_PROC_STATE_UNDEF == pptr->state) {
687
688 continue;
689 }
690 if (!ORTE_PROC_IS_HNP &&
691 orte_get_attribute(&jdata->attributes, ORTE_JOB_FULLY_DESCRIBED, NULL, OPAL_BOOL)) {
692
693
694 opal_output_verbose(5, orte_odls_base_framework.framework_output,
695 "%s GETTING DAEMON FOR PROC %s WITH PARENT %s",
696 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
697 ORTE_NAME_PRINT(&pptr->name),
698 ORTE_VPID_PRINT(pptr->parent));
699 if (ORTE_VPID_INVALID == pptr->parent) {
700 ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
701 rc = ORTE_ERR_BAD_PARAM;
702 goto REPORT_ERROR;
703 }
704
705 if (NULL == (dmn = (orte_proc_t*)opal_pointer_array_get_item(daemons->procs, pptr->parent))) {
706 ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
707 rc = ORTE_ERR_NOT_FOUND;
708 goto REPORT_ERROR;
709 }
710 OBJ_RETAIN(dmn->node);
711 pptr->node = dmn->node;
712
713 if (!ORTE_FLAG_TEST(pptr->node, ORTE_NODE_FLAG_MAPPED)) {
714 OBJ_RETAIN(pptr->node);
715 opal_pointer_array_add(jdata->map->nodes, pptr->node);
716 jdata->map->num_nodes++;
717 ORTE_FLAG_SET(pptr->node, ORTE_NODE_FLAG_MAPPED);
718 }
719
720 OBJ_RETAIN(pptr);
721 opal_pointer_array_add(pptr->node->procs, pptr);
722 pptr->node->num_procs++;
723 }
724
725 if (pptr->parent == ORTE_PROC_MY_NAME->vpid) {
726
727 if (!ORTE_FLAG_TEST(pptr, ORTE_PROC_FLAG_LOCAL)) {
728
729 OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
730 "%s[%s:%d] adding proc %s to my local list",
731 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
732 __FILE__, __LINE__,
733 ORTE_NAME_PRINT(&pptr->name)));
734
735 jdata->num_local_procs++;
736
737 OBJ_RETAIN(pptr);
738 ORTE_FLAG_SET(pptr, ORTE_PROC_FLAG_LOCAL);
739 opal_pointer_array_add(orte_local_children, pptr);
740 }
741
742
743 if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_RESTART)) {
744 orte_set_attribute(&pptr->attributes, ORTE_PROC_NOBARRIER, ORTE_ATTR_LOCAL, NULL, OPAL_BOOL);
745 }
746
747 app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, pptr->app_idx);
748 ORTE_FLAG_SET(app, ORTE_APP_FLAG_USED_ON_NODE);
749 }
750 }
751 if (orte_get_attribute(&jdata->attributes, ORTE_JOB_FULLY_DESCRIBED, NULL, OPAL_BOOL)) {
752
753 for (n=0; n < jdata->map->nodes->size; n++) {
754 if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(jdata->map->nodes, n))) {
755 ORTE_FLAG_UNSET(node, ORTE_NODE_FLAG_MAPPED);
756 }
757 }
758 }
759
760 if (!orte_get_attribute(&jdata->attributes, ORTE_JOB_FULLY_DESCRIBED, NULL, OPAL_BOOL)) {
761
762 if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_bindings(jdata))) {
763 ORTE_ERROR_LOG(rc);
764 goto REPORT_ERROR;
765 }
766 }
767
768
769 if (jdata->map->display_map) {
770 orte_rmaps_base_display_map(jdata);
771 }
772
773
774
775 if (ORTE_SUCCESS != (rc = orte_pmix_server_register_nspace(jdata, false))) {
776 ORTE_ERROR_LOG(rc);
777 goto REPORT_ERROR;
778 }
779
780
781
782
783 if (0 < opal_list_get_size(&local_support) &&
784 NULL != opal_pmix.server_setup_local_support) {
785 if (OPAL_SUCCESS != (rc = opal_pmix.server_setup_local_support(jdata->jobid, &local_support,
786 ls_cbunc, &lock))) {
787 ORTE_ERROR_LOG(rc);
788 goto REPORT_ERROR;
789 }
790 } else {
791 lock.active = false;
792 }
793
794
795 orte_rtc.assign(jdata);
796
797
798 orte_odls_base_start_threads(jdata);
799
800
801
802
803
804
805
806 OPAL_PMIX_WAIT_THREAD(&lock);
807 OPAL_PMIX_DESTRUCT_LOCK(&lock);
808 OPAL_LIST_DESTRUCT(&local_support);
809 return ORTE_SUCCESS;
810
811 REPORT_ERROR:
812 OPAL_PMIX_DESTRUCT_LOCK(&lock);
813 OPAL_LIST_DESTRUCT(&local_support);
814
815
816
817
818
819
820 ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_NEVER_LAUNCHED);
821 return rc;
822 }
823
824 static int setup_path(orte_app_context_t *app, char **wdir)
825 {
826 int rc=ORTE_SUCCESS;
827 char dir[MAXPATHLEN];
828
829 if (!orte_get_attribute(&app->attributes, ORTE_APP_SSNDIR_CWD, NULL, OPAL_BOOL)) {
830
831
832
833
834 if (ORTE_SUCCESS != (rc = orte_util_check_context_cwd(app, true))) {
835
836 goto CLEANUP;
837 }
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852 if (NULL == getcwd(dir, sizeof(dir))) {
853 return ORTE_ERR_OUT_OF_RESOURCE;
854 }
855 *wdir = strdup(dir);
856 opal_setenv("PWD", dir, true, &app->env);
857
858 opal_setenv(OPAL_MCA_PREFIX"initial_wdir", dir, true, &app->env);
859 } else {
860 *wdir = NULL;
861 }
862
863 CLEANUP:
864 return rc;
865 }
866
867
868
869
870
871 static void timer_cb(int fd, short event, void *cbdata)
872 {
873 orte_timer_t *tm = (orte_timer_t*)cbdata;
874 orte_odls_launch_local_t *ll = (orte_odls_launch_local_t*)tm->payload;
875
876 ORTE_ACQUIRE_OBJECT(tm);
877
878
879 ll->retries++;
880
881
882 opal_event_active(ll->ev, OPAL_EV_WRITE, 1);
883
884
885 OBJ_RELEASE(tm);
886 }
887
888 static int compute_num_procs_alive(orte_jobid_t job)
889 {
890 int i;
891 orte_proc_t *child;
892 int num_procs_alive = 0;
893
894 for (i=0; i < orte_local_children->size; i++) {
895 if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
896 continue;
897 }
898 if (!ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_ALIVE)) {
899 continue;
900 }
901
902
903
904 if (job == child->name.jobid) {
905 continue;
906 }
907 num_procs_alive++;
908 }
909 return num_procs_alive;
910 }
911
912 void orte_odls_base_spawn_proc(int fd, short sd, void *cbdata)
913 {
914 orte_odls_spawn_caddy_t *cd = (orte_odls_spawn_caddy_t*)cbdata;
915 orte_job_t *jobdat = cd->jdata;
916 orte_app_context_t *app = cd->app;
917 orte_proc_t *child = cd->child;
918 int rc, i;
919 bool found;
920 orte_proc_state_t state;
921
922 ORTE_ACQUIRE_OBJECT(cd);
923
924
925 cd->env = opal_argv_copy(app->env);
926
927
928
929
930 child->exit_code = 0;
931 ORTE_FLAG_UNSET(child, ORTE_PROC_FLAG_WAITPID);
932
933
934 if (OPAL_SUCCESS != (rc = opal_pmix.server_setup_fork(&child->name, &cd->env))) {
935 ORTE_ERROR_LOG(rc);
936 state = ORTE_PROC_STATE_FAILED_TO_LAUNCH;
937 goto errorout;
938 }
939
940
941
942
943 if (ORTE_FLAG_TEST(jobdat, ORTE_JOB_FLAG_FORWARD_OUTPUT)) {
944 ORTE_FLAG_UNSET(child, ORTE_PROC_FLAG_IOF_COMPLETE);
945 } else {
946 ORTE_FLAG_SET(child, ORTE_PROC_FLAG_IOF_COMPLETE);
947 }
948 child->pid = 0;
949 if (NULL != child->rml_uri) {
950 free(child->rml_uri);
951 child->rml_uri = NULL;
952 }
953
954
955
956
957 if (ORTE_SUCCESS != (rc = orte_schizo.setup_child(jobdat, child, app, &cd->env))) {
958 ORTE_ERROR_LOG(rc);
959 state = ORTE_PROC_STATE_FAILED_TO_LAUNCH;
960 goto errorout;
961 }
962
963
964 if (NULL != orte_xterm && !ORTE_FLAG_TEST(jobdat, ORTE_JOB_FLAG_DEBUGGER_DAEMON)) {
965 opal_list_item_t *nmitem;
966 orte_namelist_t *nm;
967
968 found = false;
969 for (nmitem = opal_list_get_first(&orte_odls_globals.xterm_ranks);
970 nmitem != opal_list_get_end(&orte_odls_globals.xterm_ranks);
971 nmitem = opal_list_get_next(nmitem)) {
972 nm = (orte_namelist_t*)nmitem;
973 if (ORTE_VPID_WILDCARD == nm->name.vpid ||
974 child->name.vpid == nm->name.vpid) {
975
976
977 cd->argv = opal_argv_copy(orte_odls_globals.xtermcmd);
978
979 free(cd->argv[2]);
980 opal_asprintf(&cd->argv[2], "Rank %s", ORTE_VPID_PRINT(child->name.vpid));
981
982 for (i=0; NULL != app->argv[i]; i++) {
983 opal_argv_append_nosize(&cd->argv, app->argv[i]);
984 }
985
986 cd->cmd = strdup(orte_odls_globals.xtermcmd[0]);
987 found = true;
988 break;
989 } else if (jobdat->num_procs <= nm->name.vpid) {
990
991 orte_show_help("help-orte-odls-base.txt",
992 "orte-odls-base:xterm-rank-out-of-bounds",
993 true, orte_process_info.nodename,
994 nm->name.vpid, jobdat->num_procs);
995 state = ORTE_PROC_STATE_FAILED_TO_LAUNCH;
996 goto errorout;
997 }
998 }
999 if (!found) {
1000 cd->cmd = strdup(app->app);
1001 cd->argv = opal_argv_copy(app->argv);
1002 }
1003 } else if (NULL != orte_fork_agent) {
1004
1005 cd->argv = opal_argv_copy(orte_fork_agent);
1006
1007 for (i=0; NULL != app->argv[i]; i++) {
1008 opal_argv_append_nosize(&cd->argv, app->argv[i]);
1009 }
1010 cd->cmd = opal_path_findv(orte_fork_agent[0], X_OK, orte_launch_environ, NULL);
1011 if (NULL == cd->cmd) {
1012 orte_show_help("help-orte-odls-base.txt",
1013 "orte-odls-base:fork-agent-not-found",
1014 true, orte_process_info.nodename, orte_fork_agent[0]);
1015 state = ORTE_PROC_STATE_FAILED_TO_LAUNCH;
1016 goto errorout;
1017 }
1018 } else {
1019 cd->cmd = strdup(app->app);
1020 cd->argv = opal_argv_copy(app->argv);
1021 }
1022
1023
1024 if (cd->index_argv && !ORTE_FLAG_TEST(jobdat, ORTE_JOB_FLAG_DEBUGGER_DAEMON)) {
1025 char *param;
1026 opal_asprintf(¶m, "%s-%d", cd->argv[0], (int)child->name.vpid);
1027 free(cd->argv[0]);
1028 cd->argv[0] = param;
1029 }
1030
1031 opal_output_verbose(5, orte_odls_base_framework.framework_output,
1032 "%s odls:launch spawning child %s",
1033 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1034 ORTE_NAME_PRINT(&child->name));
1035
1036 if (15 < opal_output_get_verbosity(orte_odls_base_framework.framework_output)) {
1037
1038 opal_dss.dump(orte_odls_base_framework.framework_output, app, ORTE_APP_CONTEXT);
1039 }
1040
1041 if (ORTE_SUCCESS != (rc = cd->fork_local(cd))) {
1042
1043 state = ORTE_PROC_STATE_FAILED_TO_START;
1044 goto errorout;
1045 }
1046
1047 ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_RUNNING);
1048 OBJ_RELEASE(cd);
1049 return;
1050
1051 errorout:
1052 ORTE_FLAG_UNSET(child, ORTE_PROC_FLAG_ALIVE);
1053 child->exit_code = rc;
1054 ORTE_ACTIVATE_PROC_STATE(&child->name, state);
1055 OBJ_RELEASE(cd);
1056 }
1057
1058 void orte_odls_base_default_launch_local(int fd, short sd, void *cbdata)
1059 {
1060 orte_app_context_t *app;
1061 orte_proc_t *child=NULL;
1062 int rc=ORTE_SUCCESS;
1063 char basedir[MAXPATHLEN];
1064 int j, idx;
1065 int total_num_local_procs = 0;
1066 orte_odls_launch_local_t *caddy = (orte_odls_launch_local_t*)cbdata;
1067 orte_job_t *jobdat;
1068 orte_jobid_t job = caddy->job;
1069 orte_odls_base_fork_local_proc_fn_t fork_local = caddy->fork_local;
1070 bool index_argv;
1071 char *msg;
1072 orte_odls_spawn_caddy_t *cd;
1073 opal_event_base_t *evb;
1074 char *effective_dir = NULL;
1075 char **argvptr;
1076 char *pathenv = NULL, *mpiexec_pathenv = NULL;
1077 char *full_search;
1078
1079 ORTE_ACQUIRE_OBJECT(caddy);
1080
1081 opal_output_verbose(5, orte_odls_base_framework.framework_output,
1082 "%s local:launch",
1083 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
1084
1085
1086
1087
1088
1089 if (NULL == getcwd(basedir, sizeof(basedir))) {
1090 ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_FAILED_TO_LAUNCH);
1091 goto ERROR_OUT;
1092 }
1093
1094 if (NULL == (jobdat = orte_get_job_data_object(job))) {
1095 ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
1096
1097
1098
1099 ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_FAILED_TO_LAUNCH);
1100 goto ERROR_OUT;
1101 }
1102
1103
1104 if (0 == jobdat->num_local_procs) {
1105
1106 opal_output_verbose(5, orte_odls_base_framework.framework_output,
1107 "%s local:launch no local procs",
1108 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
1109 goto GETOUT;
1110 }
1111
1112
1113 index_argv = orte_get_attribute(&jobdat->attributes, ORTE_JOB_INDEX_ARGV, NULL, OPAL_BOOL);
1114
1115
1116 total_num_local_procs = compute_num_procs_alive(job) + jobdat->num_local_procs;
1117
1118
1119
1120
1121
1122
1123 if (0 < opal_sys_limits.num_procs) {
1124 OPAL_OUTPUT_VERBOSE((10, orte_odls_base_framework.framework_output,
1125 "%s checking limit on num procs %d #children needed %d",
1126 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1127 opal_sys_limits.num_procs, total_num_local_procs));
1128 if (opal_sys_limits.num_procs < total_num_local_procs) {
1129 if (2 < caddy->retries) {
1130
1131 ORTE_ACTIVATE_JOB_STATE(jobdat, ORTE_JOB_STATE_FAILED_TO_LAUNCH);
1132 goto ERROR_OUT;
1133 }
1134
1135
1136
1137
1138 ORTE_DETECT_TIMEOUT(1000, 1000, -1, timer_cb, caddy);
1139 return;
1140 }
1141 }
1142
1143
1144
1145
1146
1147
1148 if (0 < opal_sys_limits.num_files) {
1149 int limit;
1150 limit = 4*total_num_local_procs + 6*jobdat->num_local_procs;
1151 OPAL_OUTPUT_VERBOSE((10, orte_odls_base_framework.framework_output,
1152 "%s checking limit on file descriptors %d need %d",
1153 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1154 opal_sys_limits.num_files, limit));
1155 if (opal_sys_limits.num_files < limit) {
1156 if (2 < caddy->retries) {
1157
1158 for (idx=0; idx < orte_local_children->size; idx++) {
1159 if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, idx))) {
1160 continue;
1161 }
1162 if (OPAL_EQUAL == opal_dss.compare(&job, &(child->name.jobid), ORTE_JOBID)) {
1163 child->exit_code = ORTE_PROC_STATE_FAILED_TO_LAUNCH;
1164 ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_LAUNCH);
1165 }
1166 }
1167 goto ERROR_OUT;
1168 }
1169
1170 ORTE_DETECT_TIMEOUT(1000, 1000, -1, timer_cb, caddy);
1171 return;
1172 }
1173 }
1174
1175 for (j=0; j < jobdat->apps->size; j++) {
1176 if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jobdat->apps, j))) {
1177 continue;
1178 }
1179
1180
1181 if (!ORTE_FLAG_TEST(app, ORTE_APP_FLAG_USED_ON_NODE)) {
1182 opal_output_verbose(5, orte_odls_base_framework.framework_output,
1183 "%s app %d not used on node",
1184 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), j);
1185 continue;
1186 }
1187
1188
1189 if (ORTE_SUCCESS != (rc = orte_schizo.setup_fork(jobdat, app))) {
1190
1191 OPAL_OUTPUT_VERBOSE((10, orte_odls_base_framework.framework_output,
1192 "%s odls:launch:setup_fork failed with error %s",
1193 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1194 ORTE_ERROR_NAME(rc)));
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204 for (idx=0; idx < orte_local_children->size; idx++) {
1205 if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, idx))) {
1206 continue;
1207 }
1208 if (OPAL_EQUAL == opal_dss.compare(&job, &(child->name.jobid), ORTE_JOBID) &&
1209 j == (int)child->app_idx) {
1210 child->exit_code = ORTE_PROC_STATE_FAILED_TO_LAUNCH;
1211 ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_LAUNCH);
1212 }
1213 }
1214 goto GETOUT;
1215 }
1216
1217
1218
1219
1220 if (ORTE_SUCCESS != (rc = setup_path(app, &effective_dir))) {
1221 OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1222 "%s odls:launch:setup_path failed with error %s(%d)",
1223 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1224 ORTE_ERROR_NAME(rc), rc));
1225
1226
1227
1228
1229
1230
1231
1232
1233 for (idx=0; idx < orte_local_children->size; idx++) {
1234 if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, idx))) {
1235 continue;
1236 }
1237 if (OPAL_EQUAL == opal_dss.compare(&job, &(child->name.jobid), ORTE_JOBID) &&
1238 j == (int)child->app_idx) {
1239 child->exit_code = rc;
1240 ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_LAUNCH);
1241 }
1242 }
1243 goto GETOUT;
1244 }
1245
1246
1247 if (ORTE_SUCCESS != (rc = orte_filem.link_local_files(jobdat, app))) {
1248
1249 for (idx=0; idx < orte_local_children->size; idx++) {
1250 if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, idx))) {
1251 continue;
1252 }
1253 if (OPAL_EQUAL == opal_dss.compare(&job, &(child->name.jobid), ORTE_JOBID) &&
1254 j == (int)child->app_idx) {
1255 child->exit_code = rc;
1256 ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_LAUNCH);
1257 }
1258 }
1259 goto GETOUT;
1260 }
1261
1262
1263 for (argvptr = app->env; *argvptr != NULL; argvptr++) {
1264 if (0 == strncmp("OMPI_exec_path=", *argvptr, 15)) {
1265 mpiexec_pathenv = *argvptr + 15;
1266 }
1267 if (0 == strncmp("PATH=", *argvptr, 5)) {
1268 pathenv = *argvptr + 5;
1269 }
1270 }
1271
1272
1273
1274
1275
1276
1277 if (NULL != mpiexec_pathenv) {
1278 argvptr = NULL;
1279 if (pathenv != NULL) {
1280 opal_asprintf(&full_search, "%s:%s", mpiexec_pathenv, pathenv);
1281 } else {
1282 opal_asprintf(&full_search, "%s", mpiexec_pathenv);
1283 }
1284 opal_setenv("PATH", full_search, true, &argvptr);
1285 free(full_search);
1286 } else {
1287 argvptr = app->env;
1288 }
1289
1290 rc = orte_util_check_context_app(app, argvptr);
1291
1292 if (NULL != mpiexec_pathenv) {
1293 opal_argv_free(argvptr);
1294 }
1295 if (ORTE_SUCCESS != rc) {
1296
1297 for (idx=0; idx < orte_local_children->size; idx++) {
1298 if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, idx))) {
1299 continue;
1300 }
1301 if (OPAL_EQUAL == opal_dss.compare(&job, &(child->name.jobid), ORTE_JOBID) &&
1302 j == (int)child->app_idx) {
1303 child->exit_code = rc;
1304 ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_LAUNCH);
1305 }
1306 }
1307 goto GETOUT;
1308 }
1309
1310
1311
1312 opal_setenv(OPAL_MCA_PREFIX"orte_launch", "1", true, &app->env);
1313
1314
1315 if (OPAL_SUCCESS != (rc = opal_util_init_sys_limits(&msg))) {
1316 orte_show_help("help-orte-odls-default.txt", "set limit", true,
1317 orte_process_info.nodename, app,
1318 __FILE__, __LINE__, msg);
1319
1320 for (idx=0; idx < orte_local_children->size; idx++) {
1321 if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, idx))) {
1322 continue;
1323 }
1324 if (OPAL_EQUAL == opal_dss.compare(&job, &(child->name.jobid), ORTE_JOBID) &&
1325 j == (int)child->app_idx) {
1326 child->exit_code = rc;
1327 ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_LAUNCH);
1328 }
1329 }
1330 goto GETOUT;
1331 }
1332
1333
1334
1335
1336
1337
1338
1339
1340 if (0 != chdir(basedir)) {
1341 ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_LAUNCH);
1342 goto GETOUT;
1343 }
1344
1345
1346 for (idx=0; idx < orte_local_children->size; idx++) {
1347 if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, idx))) {
1348 continue;
1349 }
1350
1351 if (j != (int)child->app_idx) {
1352 continue;
1353 }
1354
1355
1356
1357
1358
1359 if (ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_ALIVE)) {
1360
1361 OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1362 "%s odls:launch child %s has already been launched",
1363 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1364 ORTE_NAME_PRINT(&child->name)));
1365
1366 continue;
1367 }
1368
1369
1370
1371 if (ORTE_PROC_STATE_INIT != child->state &&
1372 ORTE_PROC_STATE_RESTART != child->state) {
1373 continue;
1374 }
1375
1376
1377
1378
1379 if (OPAL_EQUAL != opal_dss.compare(&job, &(child->name.jobid), ORTE_JOBID)) {
1380
1381 OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1382 "%s odls:launch child %s is not in job %s being launched",
1383 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1384 ORTE_NAME_PRINT(&child->name),
1385 ORTE_JOBID_PRINT(job)));
1386
1387 continue;
1388 }
1389
1390 OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1391 "%s odls:launch working child %s",
1392 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1393 ORTE_NAME_PRINT(&child->name)));
1394
1395
1396 ++orte_odls_globals.next_base;
1397 if (orte_odls_globals.num_threads <= orte_odls_globals.next_base) {
1398 orte_odls_globals.next_base = 0;
1399 }
1400 evb = orte_odls_globals.ev_bases[orte_odls_globals.next_base];
1401
1402
1403
1404 ORTE_FLAG_SET(child, ORTE_PROC_FLAG_ALIVE);
1405 orte_wait_cb(child, orte_odls_base_default_wait_local_proc, evb, NULL);
1406
1407
1408 cd = OBJ_NEW(orte_odls_spawn_caddy_t);
1409 if (NULL != effective_dir) {
1410 cd->wdir = strdup(effective_dir);
1411 }
1412 cd->jdata = jobdat;
1413 cd->app = app;
1414 cd->child = child;
1415 cd->fork_local = fork_local;
1416 cd->index_argv = index_argv;
1417
1418 cd->opts.usepty = OPAL_ENABLE_PTY_SUPPORT;
1419
1420
1421 if (jobdat->stdin_target == ORTE_VPID_WILDCARD ||
1422 child->name.vpid == jobdat->stdin_target) {
1423 cd->opts.connect_stdin = true;
1424 } else {
1425 cd->opts.connect_stdin = false;
1426 }
1427 if (ORTE_SUCCESS != (rc = orte_iof_base_setup_prefork(&cd->opts))) {
1428 ORTE_ERROR_LOG(rc);
1429 child->exit_code = rc;
1430 OBJ_RELEASE(cd);
1431 ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_LAUNCH);
1432 goto GETOUT;
1433 }
1434 if (ORTE_FLAG_TEST(jobdat, ORTE_JOB_FLAG_FORWARD_OUTPUT)) {
1435
1436 rc = orte_iof_base_setup_parent(&child->name, &cd->opts);
1437 if (ORTE_SUCCESS != rc) {
1438 ORTE_ERROR_LOG(rc);
1439 OBJ_RELEASE(cd);
1440 ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_LAUNCH);
1441 goto GETOUT;
1442 }
1443 }
1444 opal_output_verbose(1, orte_odls_base_framework.framework_output,
1445 "%s odls:dispatch %s to thread %d",
1446 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1447 ORTE_NAME_PRINT(&child->name),
1448 orte_odls_globals.next_base);
1449 opal_event_set(evb, &cd->ev, -1,
1450 OPAL_EV_WRITE, orte_odls_base_spawn_proc, cd);
1451 opal_event_set_priority(&cd->ev, ORTE_MSG_PRI);
1452 opal_event_active(&cd->ev, OPAL_EV_WRITE, 1);
1453
1454 }
1455 if (NULL != effective_dir) {
1456 free(effective_dir);
1457 effective_dir = NULL;
1458 }
1459 }
1460
1461 GETOUT:
1462 if (NULL != effective_dir) {
1463 free(effective_dir);
1464 effective_dir = NULL;
1465 }
1466
1467 ERROR_OUT:
1468
1469 if (0 != chdir(basedir)) {
1470 ORTE_ERROR_LOG(ORTE_ERROR);
1471 }
1472
1473 OBJ_RELEASE(caddy);
1474 }
1475
1476
1477
1478
1479
1480 int orte_odls_base_default_signal_local_procs(const orte_process_name_t *proc, int32_t signal,
1481 orte_odls_base_signal_local_fn_t signal_local)
1482 {
1483 int rc, i;
1484 orte_proc_t *child;
1485
1486 OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1487 "%s odls: signaling proc %s",
1488 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1489 (NULL == proc) ? "NULL" : ORTE_NAME_PRINT(proc)));
1490
1491
1492
1493
1494 if (NULL == proc) {
1495 rc = ORTE_SUCCESS;
1496 for (i=0; i < orte_local_children->size; i++) {
1497 if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
1498 continue;
1499 }
1500 if (0 == child->pid || !ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_ALIVE)) {
1501
1502 continue;
1503 }
1504 if (ORTE_SUCCESS != (rc = signal_local(child->pid, (int)signal))) {
1505 ORTE_ERROR_LOG(rc);
1506 }
1507 }
1508 return rc;
1509 }
1510
1511
1512 for (i=0; i < orte_local_children->size; i++) {
1513 if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
1514 continue;
1515 }
1516 if (OPAL_EQUAL == opal_dss.compare(&(child->name), (orte_process_name_t*)proc, ORTE_NAME)) {
1517 if (ORTE_SUCCESS != (rc = signal_local(child->pid, (int)signal))) {
1518 ORTE_ERROR_LOG(rc);
1519 }
1520 return rc;
1521 }
1522 }
1523
1524
1525
1526
1527 ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
1528 return ORTE_ERR_NOT_FOUND;
1529 }
1530
1531
1532
1533
1534
1535 void orte_odls_base_default_wait_local_proc(int fd, short sd, void *cbdata)
1536 {
1537 orte_wait_tracker_t *t2 = (orte_wait_tracker_t*)cbdata;
1538 orte_proc_t *proc = t2->child;
1539 int i;
1540 orte_job_t *jobdat;
1541 orte_proc_state_t state=ORTE_PROC_STATE_WAITPID_FIRED;
1542 orte_proc_t *cptr;
1543
1544 opal_output_verbose(5, orte_odls_base_framework.framework_output,
1545 "%s odls:wait_local_proc child process %s pid %ld terminated",
1546 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1547 ORTE_NAME_PRINT(&proc->name), (long)proc->pid);
1548
1549
1550
1551
1552
1553
1554 if (!ORTE_FLAG_TEST(proc, ORTE_PROC_FLAG_ALIVE)) {
1555 OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1556 "%s odls:waitpid_fired child %s was already dead exit code %d",
1557 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1558 ORTE_NAME_PRINT(&proc->name),proc->exit_code));
1559 if (WIFEXITED(proc->exit_code)) {
1560 proc->exit_code = WEXITSTATUS(proc->exit_code);
1561 if (0 != proc->exit_code) {
1562 state = ORTE_PROC_STATE_TERM_NON_ZERO;
1563 }
1564 } else {
1565 if (WIFSIGNALED(proc->exit_code)) {
1566 state = ORTE_PROC_STATE_ABORTED_BY_SIG;
1567 proc->exit_code = WTERMSIG(proc->exit_code) + 128;
1568 }
1569 }
1570 goto MOVEON;
1571 }
1572
1573
1574
1575 if (ORTE_FLAG_TEST(proc, ORTE_PROC_FLAG_ABORT)) {
1576
1577
1578
1579 OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1580 "%s odls:waitpid_fired child %s died by call to abort",
1581 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1582 ORTE_NAME_PRINT(&proc->name)));
1583 state = ORTE_PROC_STATE_CALLED_ABORT;
1584
1585
1586 ORTE_FLAG_SET(proc, ORTE_PROC_FLAG_WAITPID);
1587 goto MOVEON;
1588 }
1589
1590
1591 if (NULL == (jobdat = orte_get_job_data_object(proc->name.jobid))) {
1592 ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
1593 goto MOVEON;
1594 }
1595
1596
1597
1598
1599 if (ORTE_FLAG_TEST(jobdat, ORTE_JOB_FLAG_DEBUGGER_DAEMON)) {
1600 goto MOVEON;
1601 }
1602
1603
1604
1605
1606 if (ORTE_PROC_STATE_KILLED_BY_CMD == proc->state) {
1607 OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1608 "%s odls:waitpid_fired child %s was ordered to die",
1609 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1610 ORTE_NAME_PRINT(&proc->name)));
1611
1612
1613 ORTE_FLAG_SET(proc, ORTE_PROC_FLAG_WAITPID);
1614 goto MOVEON;
1615 }
1616
1617
1618 if (WIFEXITED(proc->exit_code)) {
1619
1620
1621 proc->exit_code = WEXITSTATUS(proc->exit_code);
1622
1623 OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1624 "%s odls:waitpid_fired child %s exit code %d",
1625 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1626 ORTE_NAME_PRINT(&proc->name), proc->exit_code));
1627
1628
1629 state = ORTE_PROC_STATE_WAITPID_FIRED;
1630
1631
1632 if (ORTE_FLAG_TEST(proc, ORTE_PROC_FLAG_REG)) {
1633 if (ORTE_FLAG_TEST(proc, ORTE_PROC_FLAG_HAS_DEREG) ||
1634 orte_allowed_exit_without_sync || 0 != proc->exit_code) {
1635
1636
1637
1638
1639
1640
1641 if (0 != proc->exit_code && orte_abort_non_zero_exit) {
1642 state = ORTE_PROC_STATE_TERM_NON_ZERO;
1643 OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1644 "%s odls:waitpid_fired child process %s terminated normally "
1645 "but with a non-zero exit status - it "
1646 "will be treated as an abnormal termination",
1647 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1648 ORTE_NAME_PRINT(&proc->name)));
1649 } else {
1650
1651 state = ORTE_PROC_STATE_WAITPID_FIRED;
1652 }
1653 } else {
1654
1655
1656
1657 state = ORTE_PROC_STATE_TERM_WO_SYNC;
1658 OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1659 "%s odls:waitpid_fired child process %s terminated normally "
1660 "but did not provide a required finalize sync - it "
1661 "will be treated as an abnormal termination",
1662 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1663 ORTE_NAME_PRINT(&proc->name)));
1664 }
1665 } else {
1666
1667 for (i=0; i < orte_local_children->size; i++) {
1668 if (NULL == (cptr = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
1669 continue;
1670 }
1671 if (cptr->name.jobid != proc->name.jobid) {
1672 continue;
1673 }
1674 if (ORTE_FLAG_TEST(cptr, ORTE_PROC_FLAG_REG) && !orte_allowed_exit_without_sync) {
1675
1676
1677
1678
1679 if (0 != proc->exit_code) {
1680 state = ORTE_PROC_STATE_TERM_NON_ZERO;
1681 OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1682 "%s odls:waitpid_fired child process %s terminated normally "
1683 "but with a non-zero exit status - it "
1684 "will be treated as an abnormal termination",
1685 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1686 ORTE_NAME_PRINT(&proc->name)));
1687 } else {
1688 state = ORTE_PROC_STATE_TERM_WO_SYNC;
1689 OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1690 "%s odls:waitpid_fired child process %s terminated normally "
1691 "but did not provide a required init sync - it "
1692 "will be treated as an abnormal termination",
1693 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1694 ORTE_NAME_PRINT(&proc->name)));
1695 }
1696 goto MOVEON;
1697 }
1698 }
1699
1700
1701
1702
1703 if (0 != proc->exit_code && orte_abort_non_zero_exit) {
1704 state = ORTE_PROC_STATE_TERM_NON_ZERO;
1705 } else {
1706 state = ORTE_PROC_STATE_WAITPID_FIRED;
1707 }
1708 }
1709
1710 OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1711 "%s odls:waitpid_fired child process %s terminated %s",
1712 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1713 ORTE_NAME_PRINT(&proc->name),
1714 (0 == proc->exit_code) ? "normally" : "with non-zero status"));
1715 } else {
1716
1717
1718
1719 state = ORTE_PROC_STATE_ABORTED_BY_SIG;
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729 proc->exit_code = WTERMSIG(proc->exit_code) + 128;
1730
1731 OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1732 "%s odls:waitpid_fired child process %s terminated with signal",
1733 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1734 ORTE_NAME_PRINT(&proc->name) ));
1735
1736 }
1737
1738 MOVEON:
1739
1740 orte_wait_cb_cancel(proc);
1741 ORTE_ACTIVATE_PROC_STATE(&proc->name, state);
1742
1743 OBJ_RELEASE(t2);
1744 }
1745
1746 typedef struct {
1747 opal_list_item_t super;
1748 orte_proc_t *child;
1749 } orte_odls_quick_caddy_t;
1750 static void qcdcon(orte_odls_quick_caddy_t *p)
1751 {
1752 p->child = NULL;
1753 }
1754 static void qcddes(orte_odls_quick_caddy_t *p)
1755 {
1756 if (NULL != p->child) {
1757 OBJ_RELEASE(p->child);
1758 }
1759 }
1760 OBJ_CLASS_INSTANCE(orte_odls_quick_caddy_t,
1761 opal_list_item_t,
1762 qcdcon, qcddes);
1763
1764 int orte_odls_base_default_kill_local_procs(opal_pointer_array_t *procs,
1765 orte_odls_base_kill_local_fn_t kill_local)
1766 {
1767 orte_proc_t *child;
1768 opal_list_t procs_killed;
1769 orte_proc_t *proc, proctmp;
1770 int i, j;
1771 opal_pointer_array_t procarray, *procptr;
1772 bool do_cleanup;
1773 orte_odls_quick_caddy_t *cd;
1774
1775 OBJ_CONSTRUCT(&procs_killed, opal_list_t);
1776
1777
1778 if (NULL == procs) {
1779 OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1780 "%s odls:kill_local_proc working on WILDCARD",
1781 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
1782 OBJ_CONSTRUCT(&procarray, opal_pointer_array_t);
1783 opal_pointer_array_init(&procarray, 1, 1, 1);
1784 OBJ_CONSTRUCT(&proctmp, orte_proc_t);
1785 proctmp.name.jobid = ORTE_JOBID_WILDCARD;
1786 proctmp.name.vpid = ORTE_VPID_WILDCARD;
1787 opal_pointer_array_add(&procarray, &proctmp);
1788 procptr = &procarray;
1789 do_cleanup = true;
1790 } else {
1791 OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1792 "%s odls:kill_local_proc working on provided array",
1793 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
1794 procptr = procs;
1795 do_cleanup = false;
1796 }
1797
1798
1799 for (i=0; i < procptr->size; i++) {
1800 if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(procptr, i))) {
1801 continue;
1802 }
1803 for (j=0; j < orte_local_children->size; j++) {
1804 if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, j))) {
1805 continue;
1806 }
1807
1808 OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1809 "%s odls:kill_local_proc checking child process %s",
1810 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1811 ORTE_NAME_PRINT(&child->name)));
1812
1813
1814
1815
1816
1817 if (ORTE_JOBID_WILDCARD != proc->name.jobid &&
1818 proc->name.jobid != child->name.jobid) {
1819
1820 OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1821 "%s odls:kill_local_proc child %s is not part of job %s",
1822 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1823 ORTE_NAME_PRINT(&child->name),
1824 ORTE_JOBID_PRINT(proc->name.jobid)));
1825 continue;
1826 }
1827
1828
1829
1830
1831 if (ORTE_VPID_WILDCARD != proc->name.vpid &&
1832 proc->name.vpid != child->name.vpid) {
1833
1834 OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1835 "%s odls:kill_local_proc child %s is not covered by rank %s",
1836 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1837 ORTE_NAME_PRINT(&child->name),
1838 ORTE_VPID_PRINT(proc->name.vpid)));
1839 continue;
1840 }
1841
1842
1843
1844
1845 if (!ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_ALIVE) || 0 == child->pid) {
1846
1847 OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1848 "%s odls:kill_local_proc child %s is not alive",
1849 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1850 ORTE_NAME_PRINT(&child->name)));
1851
1852
1853
1854
1855 if (ORTE_PROC_STATE_UNDEF == child->state ||
1856 ORTE_PROC_STATE_INIT == child->state ||
1857 ORTE_PROC_STATE_RUNNING == child->state) {
1858
1859
1860
1861 child->state = ORTE_PROC_STATE_TERMINATED;
1862
1863
1864
1865 ORTE_FLAG_SET(child, ORTE_PROC_FLAG_WAITPID);
1866 child->pid = 0;
1867 goto CLEANUP;
1868 } else {
1869 continue;
1870 }
1871 }
1872
1873
1874
1875
1876 if (NULL != orte_iof.close) {
1877 orte_iof.close(&child->name, ORTE_IOF_STDIN);
1878 }
1879
1880
1881
1882
1883 orte_wait_cb_cancel(child);
1884
1885
1886
1887
1888
1889 OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1890 "%s SENDING SIGCONT TO %s",
1891 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1892 ORTE_NAME_PRINT(&child->name)));
1893 cd = OBJ_NEW(orte_odls_quick_caddy_t);
1894 OBJ_RETAIN(child);
1895 cd->child = child;
1896 opal_list_append(&procs_killed, &cd->super);
1897 kill_local(child->pid, SIGCONT);
1898 continue;
1899
1900 CLEANUP:
1901
1902 orte_session_dir_finalize(&child->name);
1903
1904
1905
1906 if (ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_IOF_COMPLETE) &&
1907 ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_WAITPID)) {
1908 ORTE_ACTIVATE_PROC_STATE(&child->name, child->state);
1909 }
1910 }
1911 }
1912
1913
1914
1915 if (0 < opal_list_get_size(&procs_killed)) {
1916 sleep(orte_odls_globals.timeout_before_sigkill);
1917
1918 OPAL_LIST_FOREACH(cd, &procs_killed, orte_odls_quick_caddy_t) {
1919 OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1920 "%s SENDING SIGTERM TO %s",
1921 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1922 ORTE_NAME_PRINT(&cd->child->name)));
1923 kill_local(cd->child->pid, SIGTERM);
1924 }
1925
1926 sleep(orte_odls_globals.timeout_before_sigkill);
1927
1928 OPAL_LIST_FOREACH(cd, &procs_killed, orte_odls_quick_caddy_t) {
1929 OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1930 "%s SENDING SIGKILL TO %s",
1931 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1932 ORTE_NAME_PRINT(&cd->child->name)));
1933 kill_local(cd->child->pid, SIGKILL);
1934
1935
1936
1937 ORTE_FLAG_SET(cd->child, ORTE_PROC_FLAG_WAITPID);
1938
1939
1940
1941
1942
1943 ORTE_FLAG_UNSET(cd->child, ORTE_PROC_FLAG_ALIVE);
1944 cd->child->pid = 0;
1945
1946
1947 cd->child->state = ORTE_PROC_STATE_KILLED_BY_CMD;
1948
1949
1950 orte_session_dir_finalize(&cd->child->name);
1951
1952
1953
1954 if (ORTE_FLAG_TEST(cd->child, ORTE_PROC_FLAG_IOF_COMPLETE) &&
1955 ORTE_FLAG_TEST(cd->child, ORTE_PROC_FLAG_WAITPID)) {
1956 ORTE_ACTIVATE_PROC_STATE(&cd->child->name, cd->child->state);
1957 }
1958 }
1959 }
1960 OPAL_LIST_DESTRUCT(&procs_killed);
1961
1962
1963 if (do_cleanup) {
1964 OBJ_DESTRUCT(&procarray);
1965 OBJ_DESTRUCT(&proctmp);
1966 }
1967
1968 return ORTE_SUCCESS;
1969 }
1970
1971 int orte_odls_base_get_proc_stats(opal_buffer_t *answer,
1972 orte_process_name_t *proc)
1973 {
1974 int rc;
1975 orte_proc_t *child;
1976 opal_pstats_t stats, *statsptr;
1977 int i, j;
1978
1979 OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
1980 "%s odls:get_proc_stats for proc %s",
1981 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
1982 ORTE_NAME_PRINT(proc)));
1983
1984
1985 for (i=0; i < orte_local_children->size; i++) {
1986 if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
1987 continue;
1988 }
1989
1990 if (proc->jobid == child->name.jobid &&
1991 (proc->vpid == child->name.vpid ||
1992 ORTE_VPID_WILDCARD == proc->vpid)) {
1993
1994 OBJ_CONSTRUCT(&stats, opal_pstats_t);
1995
1996 for (j=0; j < (int)strlen(orte_process_info.nodename) &&
1997 j < OPAL_PSTAT_MAX_STRING_LEN-1 &&
1998 orte_process_info.nodename[j] != '.'; j++) {
1999 stats.node[j] = orte_process_info.nodename[j];
2000 }
2001
2002 stats.rank = child->name.vpid;
2003
2004 rc = opal_pstat.query(child->pid, &stats, NULL);
2005 if (ORTE_SUCCESS != rc) {
2006 OBJ_DESTRUCT(&stats);
2007 return rc;
2008 }
2009 if (ORTE_SUCCESS != (rc = opal_dss.pack(answer, proc, 1, ORTE_NAME))) {
2010 ORTE_ERROR_LOG(rc);
2011 OBJ_DESTRUCT(&stats);
2012 return rc;
2013 }
2014 statsptr = &stats;
2015 if (ORTE_SUCCESS != (rc = opal_dss.pack(answer, &statsptr, 1, OPAL_PSTAT))) {
2016 ORTE_ERROR_LOG(rc);
2017 OBJ_DESTRUCT(&stats);
2018 return rc;
2019 }
2020 OBJ_DESTRUCT(&stats);
2021 }
2022 }
2023
2024 return ORTE_SUCCESS;
2025 }
2026
2027 int orte_odls_base_default_restart_proc(orte_proc_t *child,
2028 orte_odls_base_fork_local_proc_fn_t fork_local)
2029 {
2030 int rc;
2031 orte_app_context_t *app;
2032 orte_job_t *jobdat;
2033 char basedir[MAXPATHLEN];
2034 char *wdir = NULL;
2035 orte_odls_spawn_caddy_t *cd;
2036 opal_event_base_t *evb;
2037
2038 OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
2039 "%s odls:restart_proc for proc %s",
2040 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
2041 ORTE_NAME_PRINT(&child->name)));
2042
2043
2044
2045
2046
2047 if (NULL == getcwd(basedir, sizeof(basedir))) {
2048 return ORTE_ERR_OUT_OF_RESOURCE;
2049 }
2050
2051
2052 if (NULL == (jobdat = orte_get_job_data_object(child->name.jobid))) {
2053
2054 ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
2055 return ORTE_ERR_NOT_FOUND;
2056 }
2057
2058 child->state = ORTE_PROC_STATE_FAILED_TO_START;
2059 child->exit_code = 0;
2060 ORTE_FLAG_UNSET(child, ORTE_PROC_FLAG_WAITPID);
2061 ORTE_FLAG_UNSET(child, ORTE_PROC_FLAG_IOF_COMPLETE);
2062 child->pid = 0;
2063 if (NULL != child->rml_uri) {
2064 free(child->rml_uri);
2065 child->rml_uri = NULL;
2066 }
2067 app = (orte_app_context_t*)opal_pointer_array_get_item(jobdat->apps, child->app_idx);
2068
2069
2070 if (ORTE_SUCCESS != (rc = orte_schizo.setup_child(jobdat, child, app, &app->env))) {
2071 ORTE_ERROR_LOG(rc);
2072 goto CLEANUP;
2073 }
2074
2075
2076 if (ORTE_SUCCESS != (rc = setup_path(app, &wdir))) {
2077 ORTE_ERROR_LOG(rc);
2078 if (NULL != wdir) {
2079 free(wdir);
2080 }
2081 goto CLEANUP;
2082 }
2083
2084
2085 cd = OBJ_NEW(orte_odls_spawn_caddy_t);
2086 if (NULL != wdir) {
2087 cd->wdir = strdup(wdir);
2088 free(wdir);
2089 }
2090 cd->jdata = jobdat;
2091 cd->app = app;
2092 cd->child = child;
2093 cd->fork_local = fork_local;
2094
2095 cd->opts.usepty = OPAL_ENABLE_PTY_SUPPORT;
2096
2097
2098 if (jobdat->stdin_target == ORTE_VPID_WILDCARD ||
2099 child->name.vpid == jobdat->stdin_target) {
2100 cd->opts.connect_stdin = true;
2101 } else {
2102 cd->opts.connect_stdin = false;
2103 }
2104 if (ORTE_SUCCESS != (rc = orte_iof_base_setup_prefork(&cd->opts))) {
2105 ORTE_ERROR_LOG(rc);
2106 child->exit_code = rc;
2107 OBJ_RELEASE(cd);
2108 ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_LAUNCH);
2109 goto CLEANUP;
2110 }
2111 if (ORTE_FLAG_TEST(jobdat, ORTE_JOB_FLAG_FORWARD_OUTPUT)) {
2112
2113 rc = orte_iof_base_setup_parent(&child->name, &cd->opts);
2114 if (ORTE_SUCCESS != rc) {
2115 ORTE_ERROR_LOG(rc);
2116 OBJ_RELEASE(cd);
2117 ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_FAILED_TO_LAUNCH);
2118 goto CLEANUP;
2119 }
2120 }
2121 ++orte_odls_globals.next_base;
2122 if (orte_odls_globals.num_threads <= orte_odls_globals.next_base) {
2123 orte_odls_globals.next_base = 0;
2124 }
2125 evb = orte_odls_globals.ev_bases[orte_odls_globals.next_base];
2126 orte_wait_cb(child, orte_odls_base_default_wait_local_proc, evb, NULL);
2127
2128 OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
2129 "%s restarting app %s",
2130 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), app->app));
2131
2132 opal_event_set(evb, &cd->ev, -1,
2133 OPAL_EV_WRITE, orte_odls_base_spawn_proc, cd);
2134 opal_event_set_priority(&cd->ev, ORTE_MSG_PRI);
2135 opal_event_active(&cd->ev, OPAL_EV_WRITE, 1);
2136
2137 CLEANUP:
2138 OPAL_OUTPUT_VERBOSE((5, orte_odls_base_framework.framework_output,
2139 "%s odls:restart of proc %s %s",
2140 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
2141 ORTE_NAME_PRINT(&child->name),
2142 (ORTE_SUCCESS == rc) ? "succeeded" : "failed"));
2143
2144
2145
2146
2147
2148
2149
2150
2151 if (0 != chdir(basedir)) {
2152 ORTE_ERROR_LOG(ORTE_ERROR);
2153 }
2154
2155 return rc;
2156 }