root/orte/mca/errmgr/base/errmgr_base_fns.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. orte_errmgr_base_log
  2. orte_errmgr_base_abort
  3. orte_errmgr_base_abort_peers
  4. orte_errmgr_base_update_app_context_for_cr_recovery
  5. orte_errmgr_base_restart_job
  6. orte_errmgr_base_migrate_job

   1 /*
   2  * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
   3  *                         University Research and Technology
   4  *                         Corporation.  All rights reserved.
   5  * Copyright (c) 2004-2011 The University of Tennessee and The University
   6  *                         of Tennessee Research Foundation.  All rights
   7  *                         reserved.
   8  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
   9  *                         University of Stuttgart.  All rights reserved.
  10  * Copyright (c) 2004-2005 The Regents of the University of California.
  11  *                         All rights reserved.
  12  * Copyright (c) 2010      Cisco Systems, Inc.  All rights reserved.
  13  * Copyright (c) 2010-2011 Oak Ridge National Labs.  All rights reserved.
  14  * Copyright (c) 2011-2013 Los Alamos National Security, LLC.
  15  *                         All rights reserved.
  16  * Copyright (c) 2013-2018 Intel, Inc.  All rights reserved.
  17  * Copyright (c) 2014      Research Organization for Information Science
  18  *                         and Technology (RIST). All rights reserved.
  19  * $COPYRIGHT$
  20  *
  21  * Additional copyrights may follow
  22  *
  23  * $HEADER$
  24  */
  25 
  26 
  27 #include "orte_config.h"
  28 #include "orte/constants.h"
  29 
  30 #include <string.h>
  31 #if HAVE_SYS_TYPES_H
  32 #include <sys/types.h>
  33 #endif  /* HAVE_SYS_TYPES_H */
  34 #ifdef HAVE_UNISTD_H
  35 #include <unistd.h>
  36 #endif  /* HAVE_UNISTD_H */
  37 #if HAVE_SYS_TYPES_H
  38 #include <sys/types.h>
  39 #endif /* HAVE_SYS_TYPES_H */
  40 #if HAVE_SYS_STAT_H
  41 #include <sys/stat.h>
  42 #endif /* HAVE_SYS_STAT_H */
  43 #ifdef HAVE_DIRENT_H
  44 #include <dirent.h>
  45 #endif /* HAVE_DIRENT_H */
  46 #include <time.h>
  47 
  48 #include <stdlib.h>
  49 #include <stdarg.h>
  50 
  51 #include "orte/mca/mca.h"
  52 #include "opal/mca/base/base.h"
  53 #include "opal/util/os_dirpath.h"
  54 #include "opal/util/output.h"
  55 #include "opal/util/printf.h"
  56 #include "opal/util/basename.h"
  57 #include "opal/util/argv.h"
  58 #include "opal/mca/crs/crs.h"
  59 #include "opal/mca/crs/base/base.h"
  60 
  61 #include "orte/util/name_fns.h"
  62 #include "orte/util/session_dir.h"
  63 #include "orte/util/proc_info.h"
  64 
  65 #include "orte/runtime/orte_globals.h"
  66 #include "orte/runtime/runtime.h"
  67 #include "orte/runtime/orte_wait.h"
  68 #include "orte/runtime/orte_locks.h"
  69 
  70 #include "orte/mca/ess/ess.h"
  71 #include "orte/mca/state/state.h"
  72 #include "orte/mca/odls/odls.h"
  73 #include "orte/mca/plm/plm.h"
  74 #include "orte/mca/rml/rml.h"
  75 #include "orte/mca/rml/rml_types.h"
  76 #include "orte/mca/routed/routed.h"
  77 #include "orte/mca/snapc/snapc.h"
  78 #include "orte/mca/snapc/base/base.h"
  79 #include "orte/mca/sstore/sstore.h"
  80 #include "orte/mca/sstore/base/base.h"
  81 
  82 #include "orte/mca/errmgr/errmgr.h"
  83 #include "orte/mca/errmgr/base/base.h"
  84 #include "orte/mca/errmgr/base/errmgr_private.h"
  85 
  86 /*
  87  * Public interfaces
  88  */
  89 void orte_errmgr_base_log(int error_code, char *filename, int line)
  90 {
  91     char *errstring = NULL;
  92 
  93     errstring = (char*)ORTE_ERROR_NAME(error_code);
  94 
  95     if (NULL == errstring) {
  96         /* if the error is silent, say nothing */
  97         return;
  98     }
  99 
 100     opal_output(0, "%s ORTE_ERROR_LOG: %s in file %s at line %d",
 101                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
 102                 errstring, filename, line);
 103 }
 104 
 105 void orte_errmgr_base_abort(int error_code, char *fmt, ...)
 106 {
 107     va_list arglist;
 108 
 109     /* If there was a message, output it */
 110     va_start(arglist, fmt);
 111     if( NULL != fmt ) {
 112         char* buffer = NULL;
 113         opal_vasprintf( &buffer, fmt, arglist );
 114         opal_output( 0, "%s", buffer );
 115         free( buffer );
 116     }
 117     va_end(arglist);
 118 
 119     /* if I am a daemon or the HNP... */
 120     if (ORTE_PROC_IS_HNP || ORTE_PROC_IS_DAEMON) {
 121         /* whack my local procs */
 122         orte_odls.kill_local_procs(NULL);
 123         /* whack any session directories */
 124         orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);
 125     }
 126 
 127     /* if a critical connection failed, or a sensor limit was exceeded, exit without dropping a core */
 128     if (ORTE_ERR_CONNECTION_FAILED == error_code ||
 129         ORTE_ERR_SENSOR_LIMIT_EXCEEDED == error_code) {
 130         orte_ess.abort(error_code, false);
 131     } else {
 132         orte_ess.abort(error_code, true);
 133     }
 134 
 135     /*
 136      * We must exit in orte_ess.abort; all implementations of orte_ess.abort
 137      * contain __opal_attribute_noreturn__
 138      */
 139     /* No way to reach here */
 140 }
 141 
 142 int orte_errmgr_base_abort_peers(orte_process_name_t *procs,
 143                                  orte_std_cntr_t num_procs,
 144                                  int error_code)
 145 {
 146     return ORTE_ERR_NOT_IMPLEMENTED;
 147 }
 148 
 149 
 150 #if OPAL_ENABLE_FT_CR
 151 int orte_errmgr_base_update_app_context_for_cr_recovery(orte_job_t *jobdata,
 152                                                         orte_proc_t *proc,
 153                                                         opal_list_t *local_snapshots)
 154 {
 155     int exit_status = ORTE_SUCCESS;
 156     opal_list_item_t *item = NULL;
 157     orte_std_cntr_t i_app;
 158     int argc = 0;
 159     orte_app_context_t *cur_app_context = NULL;
 160     orte_app_context_t *new_app_context = NULL;
 161     orte_sstore_base_local_snapshot_info_t *vpid_snapshot = NULL;
 162     char *reference_fmt_str = NULL;
 163     char *location_str = NULL;
 164     char *cache_location_str = NULL;
 165     char *ref_location_fmt_str = NULL;
 166     char *tmp_str = NULL;
 167     char *global_snapshot_ref = NULL;
 168     char *global_snapshot_seq = NULL;
 169     char *sload;
 170 
 171     /*
 172      * Get the snapshot restart command for this process
 173      * JJH CLEANUP: Pass in the vpid_snapshot, so we don't have to look it up every time?
 174      */
 175     for(item  = opal_list_get_first(local_snapshots);
 176         item != opal_list_get_end(local_snapshots);
 177         item  = opal_list_get_next(item) ) {
 178         vpid_snapshot = (orte_sstore_base_local_snapshot_info_t*)item;
 179         if(OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL,
 180                                                        &vpid_snapshot->process_name,
 181                                                        &proc->name) ) {
 182             break;
 183         }
 184         else {
 185             vpid_snapshot = NULL;
 186         }
 187     }
 188 
 189     if( NULL == vpid_snapshot ) {
 190         ORTE_ERROR_LOG(ORTE_ERROR);
 191         exit_status = ORTE_ERROR;
 192         goto cleanup;
 193     }
 194 
 195     orte_sstore.get_attr(vpid_snapshot->ss_handle,
 196                          SSTORE_METADATA_LOCAL_SNAP_REF_FMT,
 197                          &reference_fmt_str);
 198     orte_sstore.get_attr(vpid_snapshot->ss_handle,
 199                          SSTORE_METADATA_LOCAL_SNAP_LOC,
 200                          &location_str);
 201     orte_sstore.get_attr(vpid_snapshot->ss_handle,
 202                          SSTORE_METADATA_LOCAL_SNAP_REF_LOC_FMT,
 203                          &ref_location_fmt_str);
 204     orte_sstore.get_attr(vpid_snapshot->ss_handle,
 205                          SSTORE_METADATA_GLOBAL_SNAP_REF,
 206                          &global_snapshot_ref);
 207     orte_sstore.get_attr(vpid_snapshot->ss_handle,
 208                          SSTORE_METADATA_GLOBAL_SNAP_SEQ,
 209                          &global_snapshot_seq);
 210 
 211     /*
 212      * Find current app_context
 213      */
 214     cur_app_context = NULL;
 215     for(i_app = 0; i_app < opal_pointer_array_get_size(jobdata->apps); ++i_app) {
 216         cur_app_context = (orte_app_context_t *)opal_pointer_array_get_item(jobdata->apps,
 217                                                                             i_app);
 218         if( NULL == cur_app_context ) {
 219             continue;
 220         }
 221         if(proc->app_idx == cur_app_context->idx) {
 222             break;
 223         }
 224     }
 225 
 226     if( NULL == cur_app_context ) {
 227         ORTE_ERROR_LOG(ORTE_ERROR);
 228         exit_status = ORTE_ERROR;
 229         goto cleanup;
 230     }
 231 
 232     /*
 233      * if > 1 processes in this app context
 234      *   Create a new app_context
 235      *   Copy over attributes
 236      *   Add it to the job_t data structure
 237      *   Associate it with this process in the job
 238      * else
 239      *   Reuse this app_context
 240      */
 241     if( cur_app_context->num_procs > 1 ) {
 242 
 243         /* Create a new app_context */
 244         opal_dss.copy((void**)&new_app_context, cur_app_context, ORTE_APP_CONTEXT);
 245 
 246         /* clear unused attributes */
 247         new_app_context->idx                    = cur_app_context->idx;
 248         free(new_app_context->app);
 249         new_app_context->app                    = NULL;
 250         new_app_context->num_procs              = 1;
 251         opal_argv_free(new_app_context->argv);
 252         new_app_context->argv                   = NULL;
 253 
 254         orte_remove_attribute(&new_app_context->attributes, ORTE_APP_PRELOAD_BIN);
 255 
 256         opal_asprintf(&tmp_str, reference_fmt_str, vpid_snapshot->process_name.vpid);
 257         opal_asprintf(&sload,
 258                  "%s:%s:%s:%s:%s:%s",
 259                  location_str,
 260                  global_snapshot_ref,
 261                  tmp_str,
 262                  (vpid_snapshot->compress_comp == NULL ? "" : vpid_snapshot->compress_comp),
 263                  (vpid_snapshot->compress_postfix == NULL ? "" : vpid_snapshot->compress_postfix),
 264                  global_snapshot_seq);
 265         orte_set_attribute(&new_app_context->attributes, ORTE_APP_SSTORE_LOAD, ORTE_ATTR_LOCAL, sload, OPAL_STRING);
 266         free(sload);
 267 
 268         /* Add it to the job_t data structure */
 269         /*current_global_jobdata->num_apps++; */
 270         new_app_context->idx = (jobdata->num_apps);
 271         proc->app_idx = new_app_context->idx;
 272 
 273         opal_pointer_array_add(jobdata->apps, new_app_context);
 274         ++(jobdata->num_apps);
 275 
 276         /* Remove association with the old app_context */
 277         --(cur_app_context->num_procs);
 278     }
 279     else {
 280         new_app_context = cur_app_context;
 281 
 282         /* Cleanout old stuff */
 283         free(new_app_context->app);
 284         new_app_context->app = NULL;
 285 
 286         opal_argv_free(new_app_context->argv);
 287         new_app_context->argv = NULL;
 288 
 289         opal_asprintf(&tmp_str, reference_fmt_str, vpid_snapshot->process_name.vpid);
 290         opal_asprintf(&sload,
 291                  "%s:%s:%s:%s:%s:%s",
 292                  location_str,
 293                  global_snapshot_ref,
 294                  tmp_str,
 295                  (vpid_snapshot->compress_comp == NULL ? "" : vpid_snapshot->compress_comp),
 296                  (vpid_snapshot->compress_postfix == NULL ? "" : vpid_snapshot->compress_postfix),
 297                  global_snapshot_seq);
 298         orte_set_attribute(&new_app_context->attributes, ORTE_APP_SSTORE_LOAD, ORTE_ATTR_LOCAL, sload, OPAL_STRING);
 299         free(sload);
 300     }
 301 
 302     /*
 303      * Update the app_context with the restart informaiton
 304      */
 305     new_app_context->app = strdup("opal-restart");
 306     opal_argv_append(&argc, &(new_app_context->argv), new_app_context->app);
 307     opal_argv_append(&argc, &(new_app_context->argv), "-l");
 308     opal_argv_append(&argc, &(new_app_context->argv), location_str);
 309     opal_argv_append(&argc, &(new_app_context->argv), "-m");
 310     opal_argv_append(&argc, &(new_app_context->argv), orte_sstore_base_local_metadata_filename);
 311     opal_argv_append(&argc, &(new_app_context->argv), "-r");
 312     if( NULL != tmp_str ) {
 313         free(tmp_str);
 314         tmp_str = NULL;
 315     }
 316     opal_asprintf(&tmp_str, reference_fmt_str, vpid_snapshot->process_name.vpid);
 317     opal_argv_append(&argc, &(new_app_context->argv), tmp_str);
 318 
 319  cleanup:
 320     if( NULL != tmp_str) {
 321         free(tmp_str);
 322         tmp_str = NULL;
 323     }
 324     if( NULL != location_str ) {
 325         free(location_str);
 326         location_str = NULL;
 327     }
 328     if( NULL != cache_location_str ) {
 329         free(cache_location_str);
 330         cache_location_str = NULL;
 331     }
 332     if( NULL != reference_fmt_str ) {
 333         free(reference_fmt_str);
 334         reference_fmt_str = NULL;
 335     }
 336     if( NULL != ref_location_fmt_str ) {
 337         free(ref_location_fmt_str);
 338         ref_location_fmt_str = NULL;
 339     }
 340 
 341     return exit_status;
 342 }
 343 #endif
 344 
 345 #if OPAL_ENABLE_FT_CR
 346 int orte_errmgr_base_restart_job(orte_jobid_t jobid, char * global_handle, int seq_num)
 347 {
 348     int ret, exit_status = ORTE_SUCCESS;
 349     orte_process_name_t loc_proc;
 350     orte_job_t *jdata;
 351     orte_sstore_base_handle_t prev_sstore_handle = ORTE_SSTORE_HANDLE_INVALID;
 352 
 353     /* JJH First determine if we can recover this way */
 354 
 355     /*
 356      * Find the corresponding sstore handle
 357      */
 358     prev_sstore_handle = orte_sstore_handle_last_stable;
 359     if( ORTE_SUCCESS != (ret = orte_sstore.request_restart_handle(&orte_sstore_handle_last_stable,
 360                                                                   NULL,
 361                                                                   global_handle,
 362                                                                   seq_num,
 363                                                                   NULL)) ) {
 364         ORTE_ERROR_LOG(ret);
 365         goto cleanup;
 366     }
 367 
 368     /* get the job object */
 369     if (NULL == (jdata = orte_get_job_data_object(jobid))) {
 370         exit_status = ORTE_ERR_NOT_FOUND;
 371         ORTE_ERROR_LOG(exit_status);
 372         goto cleanup;
 373     }
 374 
 375     /*
 376      * Start the recovery
 377      */
 378     orte_snapc_base_has_recovered = false;
 379     loc_proc.jobid = jobid;
 380     loc_proc.vpid  = 0;
 381     ORTE_ACTIVATE_PROC_STATE(&loc_proc, ORTE_PROC_STATE_KILLED_BY_CMD);
 382     ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_FT_RESTART);
 383     while( !orte_snapc_base_has_recovered ) {
 384         opal_progress();
 385     }
 386     orte_sstore_handle_last_stable = prev_sstore_handle;
 387 
 388  cleanup:
 389     return exit_status;
 390 }
 391 
 392 int orte_errmgr_base_migrate_job(orte_jobid_t jobid, orte_snapc_base_request_op_t *datum)
 393 {
 394     int ret, exit_status = ORTE_SUCCESS;
 395     int i;
 396     opal_list_t *proc_list = NULL;
 397     opal_list_t *node_list = NULL;
 398     opal_list_t *suggested_map_list = NULL;
 399     orte_errmgr_predicted_map_t  *onto_map = NULL;
 400 #if 0
 401     orte_errmgr_predicted_proc_t *off_proc = NULL;
 402     orte_errmgr_predicted_node_t *off_node = NULL;
 403 #endif
 404 
 405     proc_list = OBJ_NEW(opal_list_t);
 406     node_list = OBJ_NEW(opal_list_t);
 407     suggested_map_list = OBJ_NEW(opal_list_t);
 408 
 409     for( i = 0; i < datum->mig_num; ++i ) {
 410         /*
 411          * List all processes that are included in the migration.
 412          * We will sort them out in the component.
 413          */
 414         onto_map = OBJ_NEW(orte_errmgr_predicted_map_t);
 415 
 416         if( (datum->mig_off_node)[i] ) {
 417             onto_map->off_current_node = true;
 418         } else {
 419             onto_map->off_current_node = false;
 420         }
 421 
 422         /* Who to migrate */
 423         onto_map->proc_name.jobid = jobid;
 424         onto_map->proc_name.vpid = (datum->mig_vpids)[i];
 425 
 426         /* Destination */
 427         onto_map->map_proc_name.jobid = jobid;
 428         onto_map->map_proc_name.vpid  = (datum->mig_vpid_pref)[i];
 429 
 430         if( ((datum->mig_host_pref)[i])[0] == '\0') {
 431             onto_map->map_node_name = NULL;
 432         } else {
 433             onto_map->map_node_name = strdup((datum->mig_host_pref)[i]);
 434         }
 435 
 436         opal_list_append(suggested_map_list, &(onto_map->super));
 437     }
 438 
 439     if( ORTE_SUCCESS != (ret = orte_errmgr.predicted_fault(proc_list, node_list, suggested_map_list)) ) {
 440         ORTE_ERROR_LOG(ret);
 441         exit_status = ret;
 442         goto cleanup;
 443     }
 444 
 445  cleanup:
 446     return exit_status;
 447 }
 448 
 449 #endif
 450 
 451 /********************
 452  * Local Functions
 453  ********************/

/* [<][>][^][v][top][bottom][index][help] */