This source file includes the following definitions.
- orte_errmgr_base_log
- orte_errmgr_base_abort
- orte_errmgr_base_abort_peers
- orte_errmgr_base_update_app_context_for_cr_recovery
- orte_errmgr_base_restart_job
- orte_errmgr_base_migrate_job
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27 #include "orte_config.h"
28 #include "orte/constants.h"
29
30 #include <string.h>
31 #if HAVE_SYS_TYPES_H
32 #include <sys/types.h>
33 #endif
34 #ifdef HAVE_UNISTD_H
35 #include <unistd.h>
36 #endif
37 #if HAVE_SYS_TYPES_H
38 #include <sys/types.h>
39 #endif
40 #if HAVE_SYS_STAT_H
41 #include <sys/stat.h>
42 #endif
43 #ifdef HAVE_DIRENT_H
44 #include <dirent.h>
45 #endif
46 #include <time.h>
47
48 #include <stdlib.h>
49 #include <stdarg.h>
50
51 #include "orte/mca/mca.h"
52 #include "opal/mca/base/base.h"
53 #include "opal/util/os_dirpath.h"
54 #include "opal/util/output.h"
55 #include "opal/util/printf.h"
56 #include "opal/util/basename.h"
57 #include "opal/util/argv.h"
58 #include "opal/mca/crs/crs.h"
59 #include "opal/mca/crs/base/base.h"
60
61 #include "orte/util/name_fns.h"
62 #include "orte/util/session_dir.h"
63 #include "orte/util/proc_info.h"
64
65 #include "orte/runtime/orte_globals.h"
66 #include "orte/runtime/runtime.h"
67 #include "orte/runtime/orte_wait.h"
68 #include "orte/runtime/orte_locks.h"
69
70 #include "orte/mca/ess/ess.h"
71 #include "orte/mca/state/state.h"
72 #include "orte/mca/odls/odls.h"
73 #include "orte/mca/plm/plm.h"
74 #include "orte/mca/rml/rml.h"
75 #include "orte/mca/rml/rml_types.h"
76 #include "orte/mca/routed/routed.h"
77 #include "orte/mca/snapc/snapc.h"
78 #include "orte/mca/snapc/base/base.h"
79 #include "orte/mca/sstore/sstore.h"
80 #include "orte/mca/sstore/base/base.h"
81
82 #include "orte/mca/errmgr/errmgr.h"
83 #include "orte/mca/errmgr/base/base.h"
84 #include "orte/mca/errmgr/base/errmgr_private.h"
85
86
87
88
89 void orte_errmgr_base_log(int error_code, char *filename, int line)
90 {
91 char *errstring = NULL;
92
93 errstring = (char*)ORTE_ERROR_NAME(error_code);
94
95 if (NULL == errstring) {
96
97 return;
98 }
99
100 opal_output(0, "%s ORTE_ERROR_LOG: %s in file %s at line %d",
101 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
102 errstring, filename, line);
103 }
104
105 void orte_errmgr_base_abort(int error_code, char *fmt, ...)
106 {
107 va_list arglist;
108
109
110 va_start(arglist, fmt);
111 if( NULL != fmt ) {
112 char* buffer = NULL;
113 opal_vasprintf( &buffer, fmt, arglist );
114 opal_output( 0, "%s", buffer );
115 free( buffer );
116 }
117 va_end(arglist);
118
119
120 if (ORTE_PROC_IS_HNP || ORTE_PROC_IS_DAEMON) {
121
122 orte_odls.kill_local_procs(NULL);
123
124 orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);
125 }
126
127
128 if (ORTE_ERR_CONNECTION_FAILED == error_code ||
129 ORTE_ERR_SENSOR_LIMIT_EXCEEDED == error_code) {
130 orte_ess.abort(error_code, false);
131 } else {
132 orte_ess.abort(error_code, true);
133 }
134
135
136
137
138
139
140 }
141
142 int orte_errmgr_base_abort_peers(orte_process_name_t *procs,
143 orte_std_cntr_t num_procs,
144 int error_code)
145 {
146 return ORTE_ERR_NOT_IMPLEMENTED;
147 }
148
149
150 #if OPAL_ENABLE_FT_CR
151 int orte_errmgr_base_update_app_context_for_cr_recovery(orte_job_t *jobdata,
152 orte_proc_t *proc,
153 opal_list_t *local_snapshots)
154 {
155 int exit_status = ORTE_SUCCESS;
156 opal_list_item_t *item = NULL;
157 orte_std_cntr_t i_app;
158 int argc = 0;
159 orte_app_context_t *cur_app_context = NULL;
160 orte_app_context_t *new_app_context = NULL;
161 orte_sstore_base_local_snapshot_info_t *vpid_snapshot = NULL;
162 char *reference_fmt_str = NULL;
163 char *location_str = NULL;
164 char *cache_location_str = NULL;
165 char *ref_location_fmt_str = NULL;
166 char *tmp_str = NULL;
167 char *global_snapshot_ref = NULL;
168 char *global_snapshot_seq = NULL;
169 char *sload;
170
171
172
173
174
175 for(item = opal_list_get_first(local_snapshots);
176 item != opal_list_get_end(local_snapshots);
177 item = opal_list_get_next(item) ) {
178 vpid_snapshot = (orte_sstore_base_local_snapshot_info_t*)item;
179 if(OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL,
180 &vpid_snapshot->process_name,
181 &proc->name) ) {
182 break;
183 }
184 else {
185 vpid_snapshot = NULL;
186 }
187 }
188
189 if( NULL == vpid_snapshot ) {
190 ORTE_ERROR_LOG(ORTE_ERROR);
191 exit_status = ORTE_ERROR;
192 goto cleanup;
193 }
194
195 orte_sstore.get_attr(vpid_snapshot->ss_handle,
196 SSTORE_METADATA_LOCAL_SNAP_REF_FMT,
197 &reference_fmt_str);
198 orte_sstore.get_attr(vpid_snapshot->ss_handle,
199 SSTORE_METADATA_LOCAL_SNAP_LOC,
200 &location_str);
201 orte_sstore.get_attr(vpid_snapshot->ss_handle,
202 SSTORE_METADATA_LOCAL_SNAP_REF_LOC_FMT,
203 &ref_location_fmt_str);
204 orte_sstore.get_attr(vpid_snapshot->ss_handle,
205 SSTORE_METADATA_GLOBAL_SNAP_REF,
206 &global_snapshot_ref);
207 orte_sstore.get_attr(vpid_snapshot->ss_handle,
208 SSTORE_METADATA_GLOBAL_SNAP_SEQ,
209 &global_snapshot_seq);
210
211
212
213
214 cur_app_context = NULL;
215 for(i_app = 0; i_app < opal_pointer_array_get_size(jobdata->apps); ++i_app) {
216 cur_app_context = (orte_app_context_t *)opal_pointer_array_get_item(jobdata->apps,
217 i_app);
218 if( NULL == cur_app_context ) {
219 continue;
220 }
221 if(proc->app_idx == cur_app_context->idx) {
222 break;
223 }
224 }
225
226 if( NULL == cur_app_context ) {
227 ORTE_ERROR_LOG(ORTE_ERROR);
228 exit_status = ORTE_ERROR;
229 goto cleanup;
230 }
231
232
233
234
235
236
237
238
239
240
241 if( cur_app_context->num_procs > 1 ) {
242
243
244 opal_dss.copy((void**)&new_app_context, cur_app_context, ORTE_APP_CONTEXT);
245
246
247 new_app_context->idx = cur_app_context->idx;
248 free(new_app_context->app);
249 new_app_context->app = NULL;
250 new_app_context->num_procs = 1;
251 opal_argv_free(new_app_context->argv);
252 new_app_context->argv = NULL;
253
254 orte_remove_attribute(&new_app_context->attributes, ORTE_APP_PRELOAD_BIN);
255
256 opal_asprintf(&tmp_str, reference_fmt_str, vpid_snapshot->process_name.vpid);
257 opal_asprintf(&sload,
258 "%s:%s:%s:%s:%s:%s",
259 location_str,
260 global_snapshot_ref,
261 tmp_str,
262 (vpid_snapshot->compress_comp == NULL ? "" : vpid_snapshot->compress_comp),
263 (vpid_snapshot->compress_postfix == NULL ? "" : vpid_snapshot->compress_postfix),
264 global_snapshot_seq);
265 orte_set_attribute(&new_app_context->attributes, ORTE_APP_SSTORE_LOAD, ORTE_ATTR_LOCAL, sload, OPAL_STRING);
266 free(sload);
267
268
269
270 new_app_context->idx = (jobdata->num_apps);
271 proc->app_idx = new_app_context->idx;
272
273 opal_pointer_array_add(jobdata->apps, new_app_context);
274 ++(jobdata->num_apps);
275
276
277 --(cur_app_context->num_procs);
278 }
279 else {
280 new_app_context = cur_app_context;
281
282
283 free(new_app_context->app);
284 new_app_context->app = NULL;
285
286 opal_argv_free(new_app_context->argv);
287 new_app_context->argv = NULL;
288
289 opal_asprintf(&tmp_str, reference_fmt_str, vpid_snapshot->process_name.vpid);
290 opal_asprintf(&sload,
291 "%s:%s:%s:%s:%s:%s",
292 location_str,
293 global_snapshot_ref,
294 tmp_str,
295 (vpid_snapshot->compress_comp == NULL ? "" : vpid_snapshot->compress_comp),
296 (vpid_snapshot->compress_postfix == NULL ? "" : vpid_snapshot->compress_postfix),
297 global_snapshot_seq);
298 orte_set_attribute(&new_app_context->attributes, ORTE_APP_SSTORE_LOAD, ORTE_ATTR_LOCAL, sload, OPAL_STRING);
299 free(sload);
300 }
301
302
303
304
305 new_app_context->app = strdup("opal-restart");
306 opal_argv_append(&argc, &(new_app_context->argv), new_app_context->app);
307 opal_argv_append(&argc, &(new_app_context->argv), "-l");
308 opal_argv_append(&argc, &(new_app_context->argv), location_str);
309 opal_argv_append(&argc, &(new_app_context->argv), "-m");
310 opal_argv_append(&argc, &(new_app_context->argv), orte_sstore_base_local_metadata_filename);
311 opal_argv_append(&argc, &(new_app_context->argv), "-r");
312 if( NULL != tmp_str ) {
313 free(tmp_str);
314 tmp_str = NULL;
315 }
316 opal_asprintf(&tmp_str, reference_fmt_str, vpid_snapshot->process_name.vpid);
317 opal_argv_append(&argc, &(new_app_context->argv), tmp_str);
318
319 cleanup:
320 if( NULL != tmp_str) {
321 free(tmp_str);
322 tmp_str = NULL;
323 }
324 if( NULL != location_str ) {
325 free(location_str);
326 location_str = NULL;
327 }
328 if( NULL != cache_location_str ) {
329 free(cache_location_str);
330 cache_location_str = NULL;
331 }
332 if( NULL != reference_fmt_str ) {
333 free(reference_fmt_str);
334 reference_fmt_str = NULL;
335 }
336 if( NULL != ref_location_fmt_str ) {
337 free(ref_location_fmt_str);
338 ref_location_fmt_str = NULL;
339 }
340
341 return exit_status;
342 }
343 #endif
344
345 #if OPAL_ENABLE_FT_CR
346 int orte_errmgr_base_restart_job(orte_jobid_t jobid, char * global_handle, int seq_num)
347 {
348 int ret, exit_status = ORTE_SUCCESS;
349 orte_process_name_t loc_proc;
350 orte_job_t *jdata;
351 orte_sstore_base_handle_t prev_sstore_handle = ORTE_SSTORE_HANDLE_INVALID;
352
353
354
355
356
357
358 prev_sstore_handle = orte_sstore_handle_last_stable;
359 if( ORTE_SUCCESS != (ret = orte_sstore.request_restart_handle(&orte_sstore_handle_last_stable,
360 NULL,
361 global_handle,
362 seq_num,
363 NULL)) ) {
364 ORTE_ERROR_LOG(ret);
365 goto cleanup;
366 }
367
368
369 if (NULL == (jdata = orte_get_job_data_object(jobid))) {
370 exit_status = ORTE_ERR_NOT_FOUND;
371 ORTE_ERROR_LOG(exit_status);
372 goto cleanup;
373 }
374
375
376
377
378 orte_snapc_base_has_recovered = false;
379 loc_proc.jobid = jobid;
380 loc_proc.vpid = 0;
381 ORTE_ACTIVATE_PROC_STATE(&loc_proc, ORTE_PROC_STATE_KILLED_BY_CMD);
382 ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_FT_RESTART);
383 while( !orte_snapc_base_has_recovered ) {
384 opal_progress();
385 }
386 orte_sstore_handle_last_stable = prev_sstore_handle;
387
388 cleanup:
389 return exit_status;
390 }
391
392 int orte_errmgr_base_migrate_job(orte_jobid_t jobid, orte_snapc_base_request_op_t *datum)
393 {
394 int ret, exit_status = ORTE_SUCCESS;
395 int i;
396 opal_list_t *proc_list = NULL;
397 opal_list_t *node_list = NULL;
398 opal_list_t *suggested_map_list = NULL;
399 orte_errmgr_predicted_map_t *onto_map = NULL;
400 #if 0
401 orte_errmgr_predicted_proc_t *off_proc = NULL;
402 orte_errmgr_predicted_node_t *off_node = NULL;
403 #endif
404
405 proc_list = OBJ_NEW(opal_list_t);
406 node_list = OBJ_NEW(opal_list_t);
407 suggested_map_list = OBJ_NEW(opal_list_t);
408
409 for( i = 0; i < datum->mig_num; ++i ) {
410
411
412
413
414 onto_map = OBJ_NEW(orte_errmgr_predicted_map_t);
415
416 if( (datum->mig_off_node)[i] ) {
417 onto_map->off_current_node = true;
418 } else {
419 onto_map->off_current_node = false;
420 }
421
422
423 onto_map->proc_name.jobid = jobid;
424 onto_map->proc_name.vpid = (datum->mig_vpids)[i];
425
426
427 onto_map->map_proc_name.jobid = jobid;
428 onto_map->map_proc_name.vpid = (datum->mig_vpid_pref)[i];
429
430 if( ((datum->mig_host_pref)[i])[0] == '\0') {
431 onto_map->map_node_name = NULL;
432 } else {
433 onto_map->map_node_name = strdup((datum->mig_host_pref)[i]);
434 }
435
436 opal_list_append(suggested_map_list, &(onto_map->super));
437 }
438
439 if( ORTE_SUCCESS != (ret = orte_errmgr.predicted_fault(proc_list, node_list, suggested_map_list)) ) {
440 ORTE_ERROR_LOG(ret);
441 exit_status = ret;
442 goto cleanup;
443 }
444
445 cleanup:
446 return exit_status;
447 }
448
449 #endif
450
451
452
453