This source file includes the following definitions.
- orte_errmgr_base_log
- orte_errmgr_base_abort
- orte_errmgr_base_abort_peers
- orte_errmgr_base_update_app_context_for_cr_recovery
- orte_errmgr_base_restart_job
- orte_errmgr_base_migrate_job
   1 
   2 
   3 
   4 
   5 
   6 
   7 
   8 
   9 
  10 
  11 
  12 
  13 
  14 
  15 
  16 
  17 
  18 
  19 
  20 
  21 
  22 
  23 
  24 
  25 
  26 
  27 #include "orte_config.h"
  28 #include "orte/constants.h"
  29 
  30 #include <string.h>
  31 #if HAVE_SYS_TYPES_H
  32 #include <sys/types.h>
  33 #endif  
  34 #ifdef HAVE_UNISTD_H
  35 #include <unistd.h>
  36 #endif  
  37 #if HAVE_SYS_TYPES_H
  38 #include <sys/types.h>
  39 #endif 
  40 #if HAVE_SYS_STAT_H
  41 #include <sys/stat.h>
  42 #endif 
  43 #ifdef HAVE_DIRENT_H
  44 #include <dirent.h>
  45 #endif 
  46 #include <time.h>
  47 
  48 #include <stdlib.h>
  49 #include <stdarg.h>
  50 
  51 #include "orte/mca/mca.h"
  52 #include "opal/mca/base/base.h"
  53 #include "opal/util/os_dirpath.h"
  54 #include "opal/util/output.h"
  55 #include "opal/util/printf.h"
  56 #include "opal/util/basename.h"
  57 #include "opal/util/argv.h"
  58 #include "opal/mca/crs/crs.h"
  59 #include "opal/mca/crs/base/base.h"
  60 
  61 #include "orte/util/name_fns.h"
  62 #include "orte/util/session_dir.h"
  63 #include "orte/util/proc_info.h"
  64 
  65 #include "orte/runtime/orte_globals.h"
  66 #include "orte/runtime/runtime.h"
  67 #include "orte/runtime/orte_wait.h"
  68 #include "orte/runtime/orte_locks.h"
  69 
  70 #include "orte/mca/ess/ess.h"
  71 #include "orte/mca/state/state.h"
  72 #include "orte/mca/odls/odls.h"
  73 #include "orte/mca/plm/plm.h"
  74 #include "orte/mca/rml/rml.h"
  75 #include "orte/mca/rml/rml_types.h"
  76 #include "orte/mca/routed/routed.h"
  77 #include "orte/mca/snapc/snapc.h"
  78 #include "orte/mca/snapc/base/base.h"
  79 #include "orte/mca/sstore/sstore.h"
  80 #include "orte/mca/sstore/base/base.h"
  81 
  82 #include "orte/mca/errmgr/errmgr.h"
  83 #include "orte/mca/errmgr/base/base.h"
  84 #include "orte/mca/errmgr/base/errmgr_private.h"
  85 
  86 
  87 
  88 
  89 void orte_errmgr_base_log(int error_code, char *filename, int line)
  90 {
  91     char *errstring = NULL;
  92 
  93     errstring = (char*)ORTE_ERROR_NAME(error_code);
  94 
  95     if (NULL == errstring) {
  96         
  97         return;
  98     }
  99 
 100     opal_output(0, "%s ORTE_ERROR_LOG: %s in file %s at line %d",
 101                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 102                 errstring, filename, line);
 103 }
 104 
 105 void orte_errmgr_base_abort(int error_code, char *fmt, ...)
 106 {
 107     va_list arglist;
 108 
 109     
 110     va_start(arglist, fmt);
 111     if( NULL != fmt ) {
 112         char* buffer = NULL;
 113         opal_vasprintf( &buffer, fmt, arglist );
 114         opal_output( 0, "%s", buffer );
 115         free( buffer );
 116     }
 117     va_end(arglist);
 118 
 119     
 120     if (ORTE_PROC_IS_HNP || ORTE_PROC_IS_DAEMON) {
 121         
 122         orte_odls.kill_local_procs(NULL);
 123         
 124         orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);
 125     }
 126 
 127     
 128     if (ORTE_ERR_CONNECTION_FAILED == error_code ||
 129         ORTE_ERR_SENSOR_LIMIT_EXCEEDED == error_code) {
 130         orte_ess.abort(error_code, false);
 131     } else {
 132         orte_ess.abort(error_code, true);
 133     }
 134 
 135     
 136 
 137 
 138 
 139     
 140 }
 141 
 142 int orte_errmgr_base_abort_peers(orte_process_name_t *procs,
 143                                  orte_std_cntr_t num_procs,
 144                                  int error_code)
 145 {
 146     return ORTE_ERR_NOT_IMPLEMENTED;
 147 }
 148 
 149 
 150 #if OPAL_ENABLE_FT_CR
 151 int orte_errmgr_base_update_app_context_for_cr_recovery(orte_job_t *jobdata,
 152                                                         orte_proc_t *proc,
 153                                                         opal_list_t *local_snapshots)
 154 {
 155     int exit_status = ORTE_SUCCESS;
 156     opal_list_item_t *item = NULL;
 157     orte_std_cntr_t i_app;
 158     int argc = 0;
 159     orte_app_context_t *cur_app_context = NULL;
 160     orte_app_context_t *new_app_context = NULL;
 161     orte_sstore_base_local_snapshot_info_t *vpid_snapshot = NULL;
 162     char *reference_fmt_str = NULL;
 163     char *location_str = NULL;
 164     char *cache_location_str = NULL;
 165     char *ref_location_fmt_str = NULL;
 166     char *tmp_str = NULL;
 167     char *global_snapshot_ref = NULL;
 168     char *global_snapshot_seq = NULL;
 169     char *sload;
 170 
 171     
 172 
 173 
 174 
 175     for(item  = opal_list_get_first(local_snapshots);
 176         item != opal_list_get_end(local_snapshots);
 177         item  = opal_list_get_next(item) ) {
 178         vpid_snapshot = (orte_sstore_base_local_snapshot_info_t*)item;
 179         if(OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL,
 180                                                        &vpid_snapshot->process_name,
 181                                                        &proc->name) ) {
 182             break;
 183         }
 184         else {
 185             vpid_snapshot = NULL;
 186         }
 187     }
 188 
 189     if( NULL == vpid_snapshot ) {
 190         ORTE_ERROR_LOG(ORTE_ERROR);
 191         exit_status = ORTE_ERROR;
 192         goto cleanup;
 193     }
 194 
 195     orte_sstore.get_attr(vpid_snapshot->ss_handle,
 196                          SSTORE_METADATA_LOCAL_SNAP_REF_FMT,
 197                          &reference_fmt_str);
 198     orte_sstore.get_attr(vpid_snapshot->ss_handle,
 199                          SSTORE_METADATA_LOCAL_SNAP_LOC,
 200                          &location_str);
 201     orte_sstore.get_attr(vpid_snapshot->ss_handle,
 202                          SSTORE_METADATA_LOCAL_SNAP_REF_LOC_FMT,
 203                          &ref_location_fmt_str);
 204     orte_sstore.get_attr(vpid_snapshot->ss_handle,
 205                          SSTORE_METADATA_GLOBAL_SNAP_REF,
 206                          &global_snapshot_ref);
 207     orte_sstore.get_attr(vpid_snapshot->ss_handle,
 208                          SSTORE_METADATA_GLOBAL_SNAP_SEQ,
 209                          &global_snapshot_seq);
 210 
 211     
 212 
 213 
 214     cur_app_context = NULL;
 215     for(i_app = 0; i_app < opal_pointer_array_get_size(jobdata->apps); ++i_app) {
 216         cur_app_context = (orte_app_context_t *)opal_pointer_array_get_item(jobdata->apps,
 217                                                                             i_app);
 218         if( NULL == cur_app_context ) {
 219             continue;
 220         }
 221         if(proc->app_idx == cur_app_context->idx) {
 222             break;
 223         }
 224     }
 225 
 226     if( NULL == cur_app_context ) {
 227         ORTE_ERROR_LOG(ORTE_ERROR);
 228         exit_status = ORTE_ERROR;
 229         goto cleanup;
 230     }
 231 
 232     
 233 
 234 
 235 
 236 
 237 
 238 
 239 
 240 
 241     if( cur_app_context->num_procs > 1 ) {
 242 
 243         
 244         opal_dss.copy((void**)&new_app_context, cur_app_context, ORTE_APP_CONTEXT);
 245 
 246         
 247         new_app_context->idx                    = cur_app_context->idx;
 248         free(new_app_context->app);
 249         new_app_context->app                    = NULL;
 250         new_app_context->num_procs              = 1;
 251         opal_argv_free(new_app_context->argv);
 252         new_app_context->argv                   = NULL;
 253 
 254         orte_remove_attribute(&new_app_context->attributes, ORTE_APP_PRELOAD_BIN);
 255 
 256         opal_asprintf(&tmp_str, reference_fmt_str, vpid_snapshot->process_name.vpid);
 257         opal_asprintf(&sload,
 258                  "%s:%s:%s:%s:%s:%s",
 259                  location_str,
 260                  global_snapshot_ref,
 261                  tmp_str,
 262                  (vpid_snapshot->compress_comp == NULL ? "" : vpid_snapshot->compress_comp),
 263                  (vpid_snapshot->compress_postfix == NULL ? "" : vpid_snapshot->compress_postfix),
 264                  global_snapshot_seq);
 265         orte_set_attribute(&new_app_context->attributes, ORTE_APP_SSTORE_LOAD, ORTE_ATTR_LOCAL, sload, OPAL_STRING);
 266         free(sload);
 267 
 268         
 269         
 270         new_app_context->idx = (jobdata->num_apps);
 271         proc->app_idx = new_app_context->idx;
 272 
 273         opal_pointer_array_add(jobdata->apps, new_app_context);
 274         ++(jobdata->num_apps);
 275 
 276         
 277         --(cur_app_context->num_procs);
 278     }
 279     else {
 280         new_app_context = cur_app_context;
 281 
 282         
 283         free(new_app_context->app);
 284         new_app_context->app = NULL;
 285 
 286         opal_argv_free(new_app_context->argv);
 287         new_app_context->argv = NULL;
 288 
 289         opal_asprintf(&tmp_str, reference_fmt_str, vpid_snapshot->process_name.vpid);
 290         opal_asprintf(&sload,
 291                  "%s:%s:%s:%s:%s:%s",
 292                  location_str,
 293                  global_snapshot_ref,
 294                  tmp_str,
 295                  (vpid_snapshot->compress_comp == NULL ? "" : vpid_snapshot->compress_comp),
 296                  (vpid_snapshot->compress_postfix == NULL ? "" : vpid_snapshot->compress_postfix),
 297                  global_snapshot_seq);
 298         orte_set_attribute(&new_app_context->attributes, ORTE_APP_SSTORE_LOAD, ORTE_ATTR_LOCAL, sload, OPAL_STRING);
 299         free(sload);
 300     }
 301 
 302     
 303 
 304 
 305     new_app_context->app = strdup("opal-restart");
 306     opal_argv_append(&argc, &(new_app_context->argv), new_app_context->app);
 307     opal_argv_append(&argc, &(new_app_context->argv), "-l");
 308     opal_argv_append(&argc, &(new_app_context->argv), location_str);
 309     opal_argv_append(&argc, &(new_app_context->argv), "-m");
 310     opal_argv_append(&argc, &(new_app_context->argv), orte_sstore_base_local_metadata_filename);
 311     opal_argv_append(&argc, &(new_app_context->argv), "-r");
 312     if( NULL != tmp_str ) {
 313         free(tmp_str);
 314         tmp_str = NULL;
 315     }
 316     opal_asprintf(&tmp_str, reference_fmt_str, vpid_snapshot->process_name.vpid);
 317     opal_argv_append(&argc, &(new_app_context->argv), tmp_str);
 318 
 319  cleanup:
 320     if( NULL != tmp_str) {
 321         free(tmp_str);
 322         tmp_str = NULL;
 323     }
 324     if( NULL != location_str ) {
 325         free(location_str);
 326         location_str = NULL;
 327     }
 328     if( NULL != cache_location_str ) {
 329         free(cache_location_str);
 330         cache_location_str = NULL;
 331     }
 332     if( NULL != reference_fmt_str ) {
 333         free(reference_fmt_str);
 334         reference_fmt_str = NULL;
 335     }
 336     if( NULL != ref_location_fmt_str ) {
 337         free(ref_location_fmt_str);
 338         ref_location_fmt_str = NULL;
 339     }
 340 
 341     return exit_status;
 342 }
 343 #endif
 344 
 345 #if OPAL_ENABLE_FT_CR
 346 int orte_errmgr_base_restart_job(orte_jobid_t jobid, char * global_handle, int seq_num)
 347 {
 348     int ret, exit_status = ORTE_SUCCESS;
 349     orte_process_name_t loc_proc;
 350     orte_job_t *jdata;
 351     orte_sstore_base_handle_t prev_sstore_handle = ORTE_SSTORE_HANDLE_INVALID;
 352 
 353     
 354 
 355     
 356 
 357 
 358     prev_sstore_handle = orte_sstore_handle_last_stable;
 359     if( ORTE_SUCCESS != (ret = orte_sstore.request_restart_handle(&orte_sstore_handle_last_stable,
 360                                                                   NULL,
 361                                                                   global_handle,
 362                                                                   seq_num,
 363                                                                   NULL)) ) {
 364         ORTE_ERROR_LOG(ret);
 365         goto cleanup;
 366     }
 367 
 368     
 369     if (NULL == (jdata = orte_get_job_data_object(jobid))) {
 370         exit_status = ORTE_ERR_NOT_FOUND;
 371         ORTE_ERROR_LOG(exit_status);
 372         goto cleanup;
 373     }
 374 
 375     
 376 
 377 
 378     orte_snapc_base_has_recovered = false;
 379     loc_proc.jobid = jobid;
 380     loc_proc.vpid  = 0;
 381     ORTE_ACTIVATE_PROC_STATE(&loc_proc, ORTE_PROC_STATE_KILLED_BY_CMD);
 382     ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_FT_RESTART);
 383     while( !orte_snapc_base_has_recovered ) {
 384         opal_progress();
 385     }
 386     orte_sstore_handle_last_stable = prev_sstore_handle;
 387 
 388  cleanup:
 389     return exit_status;
 390 }
 391 
 392 int orte_errmgr_base_migrate_job(orte_jobid_t jobid, orte_snapc_base_request_op_t *datum)
 393 {
 394     int ret, exit_status = ORTE_SUCCESS;
 395     int i;
 396     opal_list_t *proc_list = NULL;
 397     opal_list_t *node_list = NULL;
 398     opal_list_t *suggested_map_list = NULL;
 399     orte_errmgr_predicted_map_t  *onto_map = NULL;
 400 #if 0
 401     orte_errmgr_predicted_proc_t *off_proc = NULL;
 402     orte_errmgr_predicted_node_t *off_node = NULL;
 403 #endif
 404 
 405     proc_list = OBJ_NEW(opal_list_t);
 406     node_list = OBJ_NEW(opal_list_t);
 407     suggested_map_list = OBJ_NEW(opal_list_t);
 408 
 409     for( i = 0; i < datum->mig_num; ++i ) {
 410         
 411 
 412 
 413 
 414         onto_map = OBJ_NEW(orte_errmgr_predicted_map_t);
 415 
 416         if( (datum->mig_off_node)[i] ) {
 417             onto_map->off_current_node = true;
 418         } else {
 419             onto_map->off_current_node = false;
 420         }
 421 
 422         
 423         onto_map->proc_name.jobid = jobid;
 424         onto_map->proc_name.vpid = (datum->mig_vpids)[i];
 425 
 426         
 427         onto_map->map_proc_name.jobid = jobid;
 428         onto_map->map_proc_name.vpid  = (datum->mig_vpid_pref)[i];
 429 
 430         if( ((datum->mig_host_pref)[i])[0] == '\0') {
 431             onto_map->map_node_name = NULL;
 432         } else {
 433             onto_map->map_node_name = strdup((datum->mig_host_pref)[i]);
 434         }
 435 
 436         opal_list_append(suggested_map_list, &(onto_map->super));
 437     }
 438 
 439     if( ORTE_SUCCESS != (ret = orte_errmgr.predicted_fault(proc_list, node_list, suggested_map_list)) ) {
 440         ORTE_ERROR_LOG(ret);
 441         exit_status = ret;
 442         goto cleanup;
 443     }
 444 
 445  cleanup:
 446     return exit_status;
 447 }
 448 
 449 #endif
 450 
 451 
 452 
 453