root/orte/mca/sstore/sstore.h

/* [<][>][^][v][top][bottom][index][help] */

INCLUDED FROM


   1 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
   2 /*
   3  * Copyright (c)      2010 The Trustees of Indiana University and Indiana
   4  *                         University Research and Technology
   5  * Copyright (c) 2015      Los Alamos National Security, LLC. All rights
   6  *                         reserved.
   7  *                         Corporation.  All rights reserved.
   8  * $COPYRIGHT$
   9  *
  10  * Additional copyrights may follow
  11  *
  12  * $HEADER$
  13  */
  14 /**
  15  * @file
  16  *
  17  * Distributed Stable Storage (SStore) Interface
  18  *
  19  */
  20 
  21 #ifndef MCA_SSTORE_H
  22 #define MCA_SSTORE_H
  23 
  24 #include "orte_config.h"
  25 #include "orte/constants.h"
  26 #include "orte/types.h"
  27 #include "orte/runtime/orte_globals.h"
  28 
  29 #include "orte/mca/mca.h"
  30 #include "opal/mca/base/base.h"
  31 
  32 #include "opal/class/opal_object.h"
  33 
  34 BEGIN_C_DECLS
  35 
  36 /**
  37  * Keys accepted as metadata
      *
      * A value of this type is the `key` argument of the attribute get/set
      * function types declared further down in this header.
      * The key values below are contiguous, starting at 0.
  38  */
  39 typedef uint32_t orte_sstore_base_key_t;
  40 /** CRS Component */
  41 #define SSTORE_METADATA_LOCAL_CRS_COMP          0
  42 /** Compress Component */
  43 #define SSTORE_METADATA_LOCAL_COMPRESS_COMP     1
  44 /** Compress Component Postfix */
  45 #define SSTORE_METADATA_LOCAL_COMPRESS_POSTFIX  2
  46 /** Process PID */
  47 #define SSTORE_METADATA_LOCAL_PID               3
  48 /** Checkpoint Context File */
  49 #define SSTORE_METADATA_LOCAL_CONTEXT           4
  50 /** Directory to make on restart */
  51 #define SSTORE_METADATA_LOCAL_MKDIR             5
  52 /** File to touch on restart */
  53 #define SSTORE_METADATA_LOCAL_TOUCH             6
  54 
  55 /** Local snapshot reference (e.g., opal_snapshot_0.ckpt) */
  56 #define SSTORE_METADATA_LOCAL_SNAP_REF          7
  57 /** Local snapshot reference format string (e.g., opal_snapshot_%d.ckpt) passed vpid */
  58 #define SSTORE_METADATA_LOCAL_SNAP_REF_FMT      8
  59 /** Local snapshot directory (Full Path excluding reference) */
  60 #define SSTORE_METADATA_LOCAL_SNAP_LOC          9
  61 /** Local snapshot reference directory (Full Path) */
  62 #define SSTORE_METADATA_LOCAL_SNAP_REF_LOC_FMT 10
  63 /** Local snapshot metadata file (Full Path) */
  64 #define SSTORE_METADATA_LOCAL_SNAP_META        11
  65 
  66 /** Global snapshot reference (e.g., ompi_global_snapshot_1234.ckpt) */
  67 #define SSTORE_METADATA_GLOBAL_SNAP_REF        12
  68 /** Global snapshot location (Relative Path from base) */
  69 #define SSTORE_METADATA_GLOBAL_SNAP_LOC        13
  70 /** Global snapshot location (Full path) */
  71 #define SSTORE_METADATA_GLOBAL_SNAP_LOC_ABS    14
  72 /** Global snapshot metadata file (Full path) */
  73 #define SSTORE_METADATA_GLOBAL_SNAP_META       15
  74 /** Global snapshot sequence number */
  75 #define SSTORE_METADATA_GLOBAL_SNAP_SEQ        16
  76 /** AMCA Parameter to be preserved for ompi-restart */
  77 #define SSTORE_METADATA_GLOBAL_AMCA_PARAM      17
  78 
  79 /** Total number of sequence numbers for this snapshot */
  80 #define SSTORE_METADATA_GLOBAL_SNAP_NUM_SEQ    18
  81 /** Comma separated list of all sequence numbers for this snapshot */
  82 #define SSTORE_METADATA_GLOBAL_SNAP_ALL_SEQ    19
  83 
  84 /** Access the current default base directory (Full Path) */
  85 #define SSTORE_METADATA_BASE_LOC               20
  86 
  87 /** The local process is skipping the checkpoint
  88  * Usually this is because there is a migration, and it is not participating
  89  */
  90 #define SSTORE_METADATA_LOCAL_SKIP_CKPT        21
  91 
  92 /** A Migration checkpoint does not necessarily contain all of the processes
  93  * in the job, so it is not a checkpoint that can be restarted from normally.
  94  * Therefore, it needs to be marked specially. */
  95 #define SSTORE_METADATA_GLOBAL_MIGRATING       22
  96 
  97 /** TUNE Parameter to be preserved for ompi-restart */
  98 #define SSTORE_METADATA_GLOBAL_TUNE_PARAM      23
  99 
  100 /** Number of metadata keys: one more than the highest key value (23) defined above */
  101 #define SSTORE_METADATA_MAX                    24
 102 
  103 /**
  104  * Storage handle
       *
       * Handles are 32-bit unsigned integers (orte_sstore_base_handle_t).
       * ORTE_SSTORE_HANDLE is an alias for OPAL_UINT32 (presumably the data-type
       * tag to use when packing a handle -- TODO confirm).
       * orte_sstore_handle_current and orte_sstore_handle_last_stable are only
       * declared here (extern); their definitions are elsewhere.
       * ORTE_SSTORE_HANDLE_INVALID is 0 (judging by the name, the value that
       * marks "no valid handle" -- confirm against users).
  105  */
  106 #define ORTE_SSTORE_HANDLE OPAL_UINT32
  107 typedef uint32_t orte_sstore_base_handle_t;
  108 ORTE_DECLSPEC extern orte_sstore_base_handle_t orte_sstore_handle_current;
  109 ORTE_DECLSPEC extern orte_sstore_base_handle_t orte_sstore_handle_last_stable;
  110 #define ORTE_SSTORE_HANDLE_INVALID 0
 111 
  112 /**
  113  * Local and Global snapshot information structure
  114  * Primarily used by orte-restart as an abstract way to handle metadata
       *
       * The structure directly below is the local (per-process) record.
       * It starts with an opal_list_item_t; the global structure declared
       * after it keeps a list of these records.
       * All textual fields are plain char pointers; who allocates and frees
       * them is not visible in this header.
  115  */
  116 struct orte_sstore_base_local_snapshot_info_1_0_0_t {
  117     /** List super object */
  118     opal_list_item_t super;
  119 
  120     /** Stable Storage Handle */
  121     orte_sstore_base_handle_t ss_handle;
  122 
  123     /** ORTE Process name */
  124     orte_process_name_t process_name;
  125 
  126     /** CRS Component */
  127     char *crs_comp;
  128 
  129     /** Compress Component */
  130     char *compress_comp;
  131 
  132     /** Compress Component Postfix */
  133     char *compress_postfix;
  134 
  135     /** Start/End Timestamps (kept as strings) */
  136     char *start_time;
  137     char *end_time;
  138 };
  139 typedef struct orte_sstore_base_local_snapshot_info_1_0_0_t orte_sstore_base_local_snapshot_info_1_0_0_t;
  140 typedef struct orte_sstore_base_local_snapshot_info_1_0_0_t orte_sstore_base_local_snapshot_info_t;
  141 
  142 ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_sstore_base_local_snapshot_info_t);
  143 /**
       * Global snapshot information structure.
       *
       * Starts with an opal_list_item_t and owns a list of local snapshot
       * records (local_snapshots), followed by the snapshot-wide metadata
       * fields documented below.
       */
  144 struct orte_sstore_base_global_snapshot_info_1_0_0_t {
  145     /** List super object */
  146     opal_list_item_t super;
  147 
  148     /** A list of orte_sstore_base_local_snapshot_info_t's */
  149     opal_list_t local_snapshots;
  150 
  151     /** Stable Storage Handle */
  152     orte_sstore_base_handle_t ss_handle;
  153 
  154     /** Start Timestamp (kept as a string) */
  155     char * start_time;
  156 
  157     /** End Timestamp (kept as a string) */
  158     char * end_time;
  159 
  160     /** Sequence number */
  161     int seq_num;
  162 
  163     /** Reference */
  164     char *reference;
  165 
  166     /** AMCA parameter used */
  167     char *amca_param;
  168 
  169     /** TUNE parameter used */
  170     char *tune_param;
  171 
  172     /** Internal use only: Cache some information on the structure
           * (judging by the field names: a sequence count, the sequence
           * entries as strings, the base directory, and the metadata file
           * name -- confirm against the code that fills them in) */
  173     int num_seqs;
  174     char ** all_seqs;
  175     char *basedir;
  176     char *metadata_filename;
  177 };
  178 typedef struct orte_sstore_base_global_snapshot_info_1_0_0_t orte_sstore_base_global_snapshot_info_1_0_0_t;
  179 typedef struct orte_sstore_base_global_snapshot_info_1_0_0_t orte_sstore_base_global_snapshot_info_t;
  180 
  181 ORTE_DECLSPEC OBJ_CLASS_DECLARATION(orte_sstore_base_global_snapshot_info_t);
 182 
  183 /**
  184  * Module initialization function.
       * Takes no arguments.
       *
  185  * @return ORTE_SUCCESS (the only return value documented for this hook)
  186  */
  187 typedef int (*orte_sstore_base_module_init_fn_t)
  188      (void);
 189 
  190 /**
  191  * Module finalization function.
       * Takes no arguments.
       *
  192  * @return ORTE_SUCCESS (the only return value documented for this hook)
  193  */
  194 typedef int (*orte_sstore_base_module_finalize_fn_t)
  195      (void);
 196 
  197 /**
  198  * Request a checkpoint storage handle from stable storage
  199  *
  200  * @param handle Checkpoint storage handle (passed by pointer; presumably filled in on success -- confirm)
  201  * @param seq Sequence number (presumably that of the checkpoint -- TODO confirm)
  202  * @param jobid Job identifier (presumably the job being checkpointed -- TODO confirm)
  203  *
  204  * @return ORTE_SUCCESS on success
  205  * @return ORTE_ERROR on failure
  206  */
  207 typedef int (*orte_sstore_base_request_checkpoint_handle_fn_t)
  208     (orte_sstore_base_handle_t *handle, int seq, orte_jobid_t jobid);
 209 
  210 /**
  211  * Request a restart storage handle from stable storage
  212  * This function will fail if the key cannot be matched.
  213  * If multiple matches exist, it will return the latest one.
  214  * If the key is NULL, then the latest entry will be used.
       *
       * NOTE(review): no parameter is literally named "key"; the text above
       * presumably refers to the basedir/ref/seq lookup arguments -- confirm.
  215  *
  216  * @param handle Restart storage handle (passed by pointer)
       * @param basedir Base directory (presumably where the snapshot is looked up -- TODO confirm)
       * @param ref Snapshot reference (presumably the global snapshot reference name -- TODO confirm)
       * @param seq Sequence number (presumably selects one checkpoint of the snapshot -- TODO confirm)
       * @param snapshot Global snapshot information structure (passed by pointer; presumably filled in -- TODO confirm)
  217  *
  218  * @return ORTE_SUCCESS on success
  219  * @return ORTE_ERROR on failure
  220  */
  221 typedef int (*orte_sstore_base_request_restart_handle_fn_t)
  222     (orte_sstore_base_handle_t *handle,
  223      char *basedir, char *ref, int seq,
  224      orte_sstore_base_global_snapshot_info_t *snapshot);
 225 
  226 /**
  227  * Request snapshot info from a given handle.
  228  * If the key is NULL, then the latest entry will be used.
       *
       * NOTE(review): this signature has no "key" parameter; the sentence
       * above may be left over from another function -- confirm.
  229  *
  230  * @param handle Restart storage handle (passed by pointer)
       * @param snapshot Global snapshot information structure (passed by pointer; presumably filled in -- TODO confirm)
  231  *
  232  * @return ORTE_SUCCESS on success
  233  * @return ORTE_ERROR on failure
  234  */
  235 typedef int (*orte_sstore_base_request_global_snapshot_data_fn_t)
  236     (orte_sstore_base_handle_t *handle,
  237      orte_sstore_base_global_snapshot_info_t *snapshot);
 238 
  239 /**
  240  * Register access to a handle.
  241  *
  242  * @param handle Storage handle (passed by value)
  243  *
  244  * @return ORTE_SUCCESS on success
  245  * @return ORTE_ERROR on failure
  246  */
  247 typedef int (*orte_sstore_base_register_handle_fn_t)
  248     (orte_sstore_base_handle_t handle);
 249 
  250 /**
  251  * Get attribute on the storage handle
  252  *
  253  * @param handle Storage handle
  254  * @param key Key to access (one of the SSTORE_METADATA_* values)
  255  * @param value Value of the key. NULL if not available.
       *              Returned through a char ** argument; ownership of the
       *              returned string is not stated in this header.
  256  *
  257  * @return ORTE_SUCCESS on success
  258  * @return ORTE_ERROR on failure
  259  */
  260 typedef int (*orte_sstore_base_get_attribute_fn_t)
  261     (orte_sstore_base_handle_t handle, orte_sstore_base_key_t key, char **value);
 262 
  263 /**
  264  * Set attribute on the storage handle
  265  *
  266  * @param handle Storage handle
  267  * @param key Key to set (one of the SSTORE_METADATA_* values)
  268  * @param value Value of the key.
       *              Passed as a non-const char *; whether the callee copies
       *              or retains the pointer is not visible in this header.
  269  *
  270  * @return ORTE_SUCCESS on success
  271  * @return ORTE_ERROR on failure
  272  */
  273 typedef int (*orte_sstore_base_set_attribute_fn_t)
  274     (orte_sstore_base_handle_t handle, orte_sstore_base_key_t key, char *value);
 275 
  276 /**
  277  * Synchronize the handle
       * (what "synchronize" entails is left to the component implementing it)
  278  *
  279  * @param handle Storage handle (passed by value)
  280  *
  281  * @return ORTE_SUCCESS on success
  282  * @return ORTE_ERROR on failure
  283  */
  284 typedef int (*orte_sstore_base_sync_fn_t)
  285     (orte_sstore_base_handle_t handle);
 286 
  287 /**
  288  * Remove data associated with the handle
  289  *
  290  * @param handle Storage handle (passed by value)
  291  *
  292  * @return ORTE_SUCCESS on success
  293  * @return ORTE_ERROR on failure
  294  */
  295 typedef int (*orte_sstore_base_remove_fn_t)
  296     (orte_sstore_base_handle_t handle);
 297 
  298 /**
  299  * Pack a handle into a buffer
  300  * Only called between the HNP and ORTED (or Global and Local SnapC coordinators)
  301  *
  302  * @param peer Peer to which this is being sent (or NULL if to all peers)
  303  * @param buffer Buffer to pack the data into
  304  * @param handle Storage handle (passed by value)
  305  *
  306  * @return ORTE_SUCCESS on success
  307  * @return ORTE_ERROR on failure
  308  */
  309 typedef int (*orte_sstore_base_pack_fn_t)
  310     (orte_process_name_t* peer, opal_buffer_t* buffer, orte_sstore_base_handle_t handle);
 311 
  312 /**
  313  * Unpack a handle from a buffer
  314  * Only called between the HNP and ORTED (or Global and Local SnapC coordinators)
  315  *
  316  * @param peer Peer from which this was received
  317  * @param buffer Buffer to unpack the data
  318  * @param handle Storage handle (passed by pointer here, whereas the pack function takes it by value)
  319  *
  320  * @return ORTE_SUCCESS on success
  321  * @return ORTE_ERROR on failure
  322  */
  323 typedef int (*orte_sstore_base_unpack_fn_t)
  324     (orte_process_name_t* peer, opal_buffer_t* buffer, orte_sstore_base_handle_t *handle);
 325 
  326 /**
  327  * Fetch application context dependencies before local launch
  328  *
  329  * @param app Application context (passed by pointer)
  330  *
  331  * @return ORTE_SUCCESS on success
  332  * @return ORTE_ERROR on failure
  333  */
  334 typedef int (*orte_sstore_base_fetch_app_deps_fn_t)
  335     (orte_app_context_t *app);
 336 
  337 /**
  338  * Wait for all application context dependencies to be fetched
       * Takes no arguments.
  339  *
  340  * @return ORTE_SUCCESS on success
  341  * @return ORTE_ERROR on failure
  342  */
  343 typedef int (*orte_sstore_base_wait_all_deps_fn_t)
  344     (void);
 345 
  346 /**
  347  * Structure for SSTORE components.
       *
       * Begins with the MCA base component and base data members, followed by
       * three SSTORE-specific integers (verbosity, output handle, priority).
  348  */
  349 struct orte_sstore_base_component_2_0_0_t {
  350     /** MCA base component */
  351     mca_base_component_t base_version;
  352     /** MCA base data */
  353     mca_base_component_data_t base_data;
  354 
  355     /** Verbosity Level */
  356     int verbose;
  357     /** Output Handle for opal_output */
  358     int output_handle;
  359     /** Default Priority */
  360     int priority;
  361 };
  362 typedef struct orte_sstore_base_component_2_0_0_t orte_sstore_base_component_2_0_0_t;
  363 typedef struct orte_sstore_base_component_2_0_0_t orte_sstore_base_component_t;
 364 
  365 /**
  366  * Structure for SSTORE modules
       *
       * A table of function pointers, one per operation type declared
       * earlier in this header.
  367  */
  368 struct orte_sstore_base_module_1_0_0_t {
  369     /** Initialization Function */
  370     orte_sstore_base_module_init_fn_t           sstore_init;
  371     /** Finalization Function */
  372     orte_sstore_base_module_finalize_fn_t       sstore_finalize;
  373 
  374     /** Request handle: checkpoint handle, restart handle, global snapshot data, and handle registration */
  375     orte_sstore_base_request_checkpoint_handle_fn_t    request_checkpoint_handle;
  376     orte_sstore_base_request_restart_handle_fn_t       request_restart_handle;
  377     orte_sstore_base_request_global_snapshot_data_fn_t request_global_snapshot_data;
  378     orte_sstore_base_register_handle_fn_t              register_handle;
  379 
  380     /** Get/Set Attributes */
  381     orte_sstore_base_get_attribute_fn_t         get_attr;
  382     orte_sstore_base_set_attribute_fn_t         set_attr;
  383 
  384     /** Sync */
  385     orte_sstore_base_sync_fn_t                  sync;
  386 
  387     /** Remove */
  388     orte_sstore_base_remove_fn_t                remove;
  389 
  390     /** Pack/Unpack Handle */
  391     orte_sstore_base_pack_fn_t                  pack_handle;
  392     orte_sstore_base_unpack_fn_t                unpack_handle;
  393 
  394     /** Launch Helpers */
  395     orte_sstore_base_fetch_app_deps_fn_t        fetch_app_deps;
  396     orte_sstore_base_wait_all_deps_fn_t         wait_all_deps;
  397 };
  398 typedef struct orte_sstore_base_module_1_0_0_t orte_sstore_base_module_1_0_0_t;
  399 typedef struct orte_sstore_base_module_1_0_0_t orte_sstore_base_module_t;
  400 /** SSTORE module function table; only declared here (extern), defined elsewhere */
  401 ORTE_DECLSPEC extern orte_sstore_base_module_t orte_sstore;
 402 
  403 /**
  404  * Macro for use in components that are of type SSTORE
       *
       * Expands to ORTE_MCA_BASE_VERSION_2_1_0 with the string "sstore" and
       * the numbers 2, 0, 0 (presumably the framework name and its
       * major/minor/release version -- confirm against the MCA base macro).
  405  */
  406 #define ORTE_SSTORE_BASE_VERSION_2_0_0 \
  407     ORTE_MCA_BASE_VERSION_2_1_0("sstore", 2, 0, 0)
 408 
 409 END_C_DECLS
 410 
  411 #endif /* MCA_SSTORE_H */
 412 

/* [<][>][^][v][top][bottom][index][help] */