root/ompi/mca/bml/r2/bml_r2_ft.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. mca_bml_r2_ft_event

   1 /*
   2  * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
   3  *                         University Research and Technology
   4  *                         Corporation.  All rights reserved.
   5  * Copyright (c) 2004-2011 The University of Tennessee and The University
   6  *                         of Tennessee Research Foundation.  All rights
   7  *                         reserved.
   8  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
   9  *                         University of Stuttgart.  All rights reserved.
  10  * Copyright (c) 2004-2006 The Regents of the University of California.
  11  *                         All rights reserved.
  12  * Copyright (c) 2007-2012 Los Alamos National Security, LLC.  All rights
  13  *                         reserved.
  14  * Copyright (c) 2008      Cisco Systems, Inc.  All rights reserved.
  15  * $COPYRIGHT$
  16  *
  17  * Additional copyrights may follow
  18  *
  19  * $HEADER$
  20  */
  21 
  22 #include "ompi_config.h"
  23 #include "opal/util/output.h"
  24 
  25 #include <stdlib.h>
  26 #include <string.h>
  27 
  28 #include "opal/runtime/opal_progress.h"
  29 #include "opal/mca/btl/base/base.h"
  30 #include "opal/mca/pmix/pmix.h"
  31 
  32 #include "ompi/runtime/ompi_cr.h"
  33 #include "ompi/mca/bml/base/base.h"
  34 #include "ompi/mca/bml/base/bml_base_btl.h"
  35 #include "ompi/mca/pml/base/base.h"
  36 #include "ompi/proc/proc.h"
  37 
  38 #include "bml_r2.h"
  39 #include "bml_r2_ft.h"
  40 
  41 int mca_bml_r2_ft_event(int state)
  42 {
  43 #if OPAL_ENABLE_FT_CR == 1
  44     static bool first_continue_pass = false;
  45     ompi_proc_t** procs = NULL;
  46     size_t num_procs;
  47     size_t btl_idx;
  48     int ret, p;
  49     int loc_state;
  50     int param_type = -1;
  51     const char **btl_list;
  52 
  53     if(OPAL_CRS_CHECKPOINT == state) {
  54         /* Do nothing for now */
  55     }
  56     else if(OPAL_CRS_CONTINUE == state) {
  57         first_continue_pass = !first_continue_pass;
  58 
  59         /* Since nothing in Checkpoint, we are fine here (unless required by BTL) */
  60         if (opal_cr_continue_like_restart && !first_continue_pass) {
  61             procs = ompi_proc_all(&num_procs);
  62             if(NULL == procs) {
  63                 return OMPI_ERR_OUT_OF_RESOURCE;
  64             }
  65         }
  66     }
  67     else if(OPAL_CRS_RESTART_PRE == state ) {
  68         /* Nothing here */
  69     }
  70     else if(OPAL_CRS_RESTART == state ) {
  71         procs = ompi_proc_all(&num_procs);
  72         if(NULL == procs) {
  73             return OMPI_ERR_OUT_OF_RESOURCE;
  74         }
  75     }
  76     else if(OPAL_CRS_TERM == state ) {
  77         ;
  78     }
  79     else {
  80         ;
  81     }
  82 
  83     /* Never call the ft_event functions attached to the BTLs on the second
  84      * pass of RESTART since on the first pass they were unloaded and therefore
  85      * no longer exist.
  86      */
  87     if( OPAL_CRS_RESTART != state ) {
  88         if( OPAL_CRS_CONTINUE == state && !first_continue_pass ) {
  89             ;
  90         } else {
  91             /* Since we only ever call into the BTLs once during the first restart
  92              * pass, just lie to them on this pass for a bit of local clarity.
  93              */
  94             if( OPAL_CRS_RESTART_PRE == state ) {
  95                 loc_state = OPAL_CRS_RESTART;
  96             } else {
  97                 loc_state = state;
  98             }
  99 
 100             /*
 101              * Call ft_event in:
 102              * - BTL modules
 103              * - MPool modules
 104              *
 105              * These should be cleaning out stale state, and memory references in
 106              * preparation for being shut down.
 107              */
 108             for(btl_idx = 0; btl_idx < mca_bml_r2.num_btl_modules; btl_idx++) {
 109                 /*
 110                  * Notify Mpool
 111                  */
 112                 if( NULL != (mca_bml_r2.btl_modules[btl_idx])->btl_mpool &&
 113                     NULL != (mca_bml_r2.btl_modules[btl_idx])->btl_mpool->mpool_ft_event ) {
 114                     opal_output_verbose(10, ompi_cr_output,
 115                                         "bml:r2: ft_event: Notify the %s MPool.\n",
 116                                         (mca_bml_r2.btl_modules[btl_idx])->btl_mpool->mpool_component->mpool_version.mca_component_name);
 117                     if(OMPI_SUCCESS != (ret = (mca_bml_r2.btl_modules[btl_idx])->btl_mpool->mpool_ft_event(loc_state) ) ) {
 118                         continue;
 119                     }
 120                 }
 121 
 122                 /*
 123                  * Notify BTL
 124                  */
 125                 if( NULL != (mca_bml_r2.btl_modules[btl_idx])->btl_ft_event) {
 126                     opal_output_verbose(10, ompi_cr_output,
 127                                         "bml:r2: ft_event: Notify the %s BTL.\n",
 128                                         (mca_bml_r2.btl_modules[btl_idx])->btl_component->btl_version.mca_component_name);
 129                     if(OMPI_SUCCESS != (ret = (mca_bml_r2.btl_modules[btl_idx])->btl_ft_event(loc_state) ) ) {
 130                         continue;
 131                     }
 132                 }
 133             }
 134         } /* OPAL_CRS_CONTINUE == state && !first_continue_pass */
 135     }
 136 
 137     if(OPAL_CRS_CHECKPOINT == state) {
 138         ;
 139     }
 140     else if(OPAL_CRS_CONTINUE == state) {
 141         /* Matches OPAL_CRS_RESTART_PRE */
 142         if (opal_cr_continue_like_restart && first_continue_pass) {
 143             if( OMPI_SUCCESS != (ret = mca_bml_r2_finalize()) ) {
 144                 opal_output(0, "bml:r2: ft_event(Restart): Failed to finalize BML framework\n");
 145                 return ret;
 146             }
 147             if( OMPI_SUCCESS != (ret = mca_base_framework_close(&opal_btl_base_framework)) ) {
 148                 opal_output(0, "bml:r2: ft_event(Restart): Failed to close BTL framework\n");
 149                 return ret;
 150             }
 151         }
 152         /* Matches OPAL_CRS_RESTART */
 153         else if (opal_cr_continue_like_restart && !first_continue_pass) {
 154             /*
 155              * Barrier to make all processes have been successfully restarted before
 156              * we try to remove some restart only files.
 157              */
 158             if( OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
 159                 opal_output(0, "bml:r2: ft_event(Restart): Failed to fence complete\n");
 160                 return ret;
 161             }
 162 
 163             /*
 164              * Re-open the BTL framework to get the full list of components.
 165              */
 166             if( OMPI_SUCCESS != (ret = mca_base_framework_open(&opal_btl_base_framework, 0)) ) {
 167                 opal_output(0, "bml:r2: ft_event(Restart): Failed to open BTL framework\n");
 168                 return ret;
 169             }
 170 
 171             /*
 172              * Re-select the BTL components/modules
 173              * This will cause the BTL components to discover the available
 174              * network options on this machine, and post proper modex informaiton.
 175              */
 176             if( OMPI_SUCCESS != (ret = mca_btl_base_select(OPAL_ENABLE_PROGRESS_THREADS, 1) ) ) {
 177                 opal_output(0, "bml:r2: ft_event(Restart): Failed to select in BTL framework\n");
 178                 return ret;
 179             }
 180 
 181             /*
 182              * Clear some structures so we can properly repopulate them
 183              */
 184             mca_bml_r2.btls_added = false;
 185 
 186             for(p = 0; p < (int)num_procs; ++p) {
 187                 if( NULL != procs[p]->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]) {
 188                     OBJ_RELEASE(procs[p]->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]);
 189                     procs[p]->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML] = NULL;
 190                 }
 191 
 192                 OBJ_RELEASE(procs[p]);
 193             }
 194 
 195             if( NULL != procs ) {
 196                 free(procs);
 197                 procs = NULL;
 198             }
 199         }
 200     }
 201     else if(OPAL_CRS_RESTART_PRE == state ) {
 202         opal_output_verbose(10, ompi_cr_output,
 203                             "bml:r2: ft_event(Restart): Finalize BML\n");
 204 
 205         /*
 206          * Finalize the BML
 207          * - Flush progress functions
 208          * - Flush module references
 209          * - mca_btl_base_close()
 210          *   Need to do this because we may have BTL components that were
 211          *   unloaded in the first selection that may be available now.
 212          *   Conversely we may have BTL components loaded now that
 213          *   are not available now.
 214          */
 215         if( OMPI_SUCCESS != (ret = mca_bml_r2_finalize()) ) {
 216             opal_output(0, "bml:r2: ft_event(Restart): Failed to finalize BML framework\n");
 217             return ret;
 218         }
 219         if( OMPI_SUCCESS != (ret = mca_base_framework_close(&opal_btl_base_framework)) ) {
 220             opal_output(0, "bml:r2: ft_event(Restart): Failed to close BTL framework\n");
 221             return ret;
 222         }
 223     }
 224     else if(OPAL_CRS_RESTART == state  ) {
 225 
 226         /*
 227          * Barrier to make all processes have been successfully restarted before
 228          * we try to remove some restart only files.
 229          */
 230         if( OMPI_SUCCESS != (ret = opal_pmix.fence(NULL, 0))) {
 231             opal_output(0, "bml:r2: ft_event(Restart): Failed to fence complete\n");
 232             return ret;
 233         }
 234 
 235         /*
 236          * Re-open the BTL framework to get the full list of components.
 237          * - but first clear the MCA value that was there
 238          */
 239         param_type = mca_base_var_find("ompi", "btl", NULL, NULL);
 240         btl_list = NULL;
 241         mca_base_var_get_value(param_type, &btl_list, NULL, NULL);
 242         opal_output_verbose(11, ompi_cr_output,
 243                             "Restart (Previous BTL MCA): <%s>\n", btl_list ? btl_list[0] : "");
 244 
 245         if( OMPI_SUCCESS != (ret = mca_base_framework_open(&opal_btl_base_framework, 0)) ) {
 246             opal_output(0, "bml:r2: ft_event(Restart): Failed to open BTL framework\n");
 247             return ret;
 248         }
 249 
 250         /* The reregistered paramter is guaranteed to have the same index */
 251         btl_list = NULL;
 252         mca_base_var_get_value(param_type, &btl_list, NULL, NULL);
 253         opal_output_verbose(11, ompi_cr_output,
 254                             "Restart (New BTL MCA): <%s>\n", btl_list ? btl_list[0] : "");
 255         if( NULL != btl_list ) {
 256             free(btl_list);
 257             btl_list = NULL;
 258         }
 259 
 260         /*
 261          * Re-select the BTL components/modules
 262          * This will cause the BTL components to discover the available
 263          * network options on this machine, and post proper modex informaiton.
 264          */
 265         if( OMPI_SUCCESS != (ret = mca_btl_base_select(OPAL_ENABLE_PROGRESS_THREADS, 1) ) ) {
 266             opal_output(0, "bml:r2: ft_event(Restart): Failed to select in BTL framework\n");
 267             return ret;
 268         }
 269 
 270         /*
 271          * Clear some structures so we can properly repopulate them
 272          */
 273         mca_bml_r2.btls_added = false;
 274 
 275         for(p = 0; p < (int)num_procs; ++p) {
 276             if( NULL != procs[p]->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]) {
 277                 OBJ_RELEASE(procs[p]->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML]);
 278                 procs[p]->proc_endpoints[OMPI_PROC_ENDPOINT_TAG_BML] = NULL;
 279             }
 280 
 281             OBJ_RELEASE(procs[p]);
 282         }
 283 
 284         if( NULL != procs ) {
 285             free(procs);
 286             procs = NULL;
 287         }
 288     }
 289     else if(OPAL_CRS_TERM == state ) {
 290         ;
 291     }
 292     else {
 293         ;
 294     }
 295 #endif
 296 
 297     return OMPI_SUCCESS;
 298 }

/* [<][>][^][v][top][bottom][index][help] */