root/ompi/mca/io/romio321/romio/adio/common/ad_open.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. ADIO_Open
  2. is_aggregator
  3. uses_generic_read
  4. uses_generic_write
  5. build_cb_config_list

   1 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil ; -*- */
   2 /* 
   3  *
   4  *   Copyright (C) 1997 University of Chicago. 
   5  *   See COPYRIGHT notice in top-level directory.
   6  */
   7 
   8 #include "adio.h"
   9 #include "adio_extern.h"
  10 #include "adio_cb_config_list.h"
  11 
  12 #include "mpio.h"
  13 static int is_aggregator(int rank, ADIO_File fd);    /* 1 if rank is found in fd->hints->ranklist, else 0 */
  14 static int uses_generic_read(ADIO_File fd);          /* 1 if ADIO_Feature(fd, ADIO_TWO_PHASE) is set */
  15 static int uses_generic_write(ADIO_File fd);         /* 1 if ADIO_Feature(fd, ADIO_TWO_PHASE) is set */
  16 static int build_cb_config_list(ADIO_File fd, 
  17         MPI_Comm orig_comm, MPI_Comm comm, 
  18         int rank, int procs, int *error_code);       /* fills fd->hints->ranklist and fd->hints->cb_nodes */
  19 
  20 MPI_File ADIO_Open(MPI_Comm orig_comm,
  21                    MPI_Comm comm, const char *filename, int file_system,
  22                    ADIOI_Fns *ops,
  23                    int access_mode, ADIO_Offset disp, MPI_Datatype etype, 
  24                    MPI_Datatype filetype,
  25                    MPI_Info info, int perm, int *error_code)
  26 {
  27     MPI_File mpi_fh;
  28     ADIO_File fd;
  29     int err, rank, procs;
  30     static char myname[] = "ADIO_OPEN";
  31     int  max_error_code;
  32     MPI_Info dupinfo;
  33     int syshints_processed, can_skip;
  34     char *p;
  35 
  36     *error_code = MPI_SUCCESS;
  37 
  38     /* obtain MPI_File handle */
  39     mpi_fh = MPIO_File_create(sizeof(struct ADIOI_FileD));
  40     if (mpi_fh == MPI_FILE_NULL) {
  41         fd = MPI_FILE_NULL;
  42         *error_code = MPIO_Err_create_code(*error_code,
  43                                            MPIR_ERR_RECOVERABLE,
  44                                            myname,
  45                                            __LINE__,
  46                                            MPI_ERR_OTHER,
  47                                            "**nomem2",0);
  48         goto fn_exit;
  49 
  50     }
  51     fd = MPIO_File_resolve(mpi_fh);
  52 
  53     fd->cookie = ADIOI_FILE_COOKIE;
  54     fd->fp_ind = disp;
  55     fd->fp_sys_posn = 0;
  56     fd->comm = comm;       /* dup'ed in MPI_File_open */
  57     fd->filename = ADIOI_Strdup(filename);
  58     fd->file_system = file_system;
  59     fd->fs_ptr = NULL;
  60 
  61     fd->fns = ops;
  62 
  63     fd->disp = disp;
  64     fd->split_coll_count = 0;
  65     fd->shared_fp_fd = ADIO_FILE_NULL;
  66     fd->atomicity = 0;
  67     fd->etype = etype;          /* MPI_BYTE by default */
  68     fd->filetype = filetype;    /* MPI_BYTE by default */
  69     fd->etype_size = 1;  /* default etype is MPI_BYTE */
  70 
  71     fd->file_realm_st_offs = NULL;
  72     fd->file_realm_types = NULL;
  73 
  74     fd->perm = perm;
  75 
  76     fd->async_count = 0;
  77 
  78     fd->fortran_handle = -1;
  79 
  80     fd->err_handler = ADIOI_DFLT_ERR_HANDLER;
  81 
  82     fd->io_buf_window = MPI_WIN_NULL;
  83     fd->io_buf_put_amounts_window = MPI_WIN_NULL;
  84 
  85     MPI_Comm_rank(comm, &rank);
  86     MPI_Comm_size(comm, &procs);
  87 /* create and initialize info object */
  88     fd->hints = (ADIOI_Hints *)ADIOI_Calloc(1, sizeof(struct ADIOI_Hints_struct));
  89     if (fd->hints == NULL) {
  90         *error_code = MPIO_Err_create_code(*error_code,
  91                                            MPIR_ERR_RECOVERABLE,
  92                                            myname,
  93                                            __LINE__,
  94                                            MPI_ERR_OTHER,
  95                                            "**nomem2",0);
  96         goto fn_exit;
  97     }
  98     fd->hints->cb_config_list = NULL;
  99     fd->hints->ranklist = NULL;
 100     fd->hints->initialized = 0;
 101     fd->info = MPI_INFO_NULL;
 102 
 103     /* move system-wide hint processing *back* into open, but this time the
 104      * hintfile reader will do a scalable read-and-broadcast.  The global
 105      * ADIOI_syshints will get initialized at first open.  subsequent open
 106      * calls will just use result from first open.
 107      *
 108      * We have two goals here:
 109      * 1: avoid processing the hintfile multiple times
 110      * 2: have all processes participate in hintfile processing (so we can read-and-broadcast)
 111      *
 112      * a code might do an "initialize from 0", so we can only skip hint
 113      * processing once everyone has particpiated in hint processing */
 114     if (ADIOI_syshints == MPI_INFO_NULL)
 115         syshints_processed = 0;
 116     else
 117         syshints_processed = 1;
 118 
 119     MPI_Allreduce(&syshints_processed, &can_skip, 1, MPI_INT, MPI_MIN, fd->comm);
 120     if (!can_skip) {
 121         if (ADIOI_syshints == MPI_INFO_NULL)
 122             MPI_Info_create(&ADIOI_syshints);
 123         ADIOI_process_system_hints(fd, ADIOI_syshints);
 124     }
 125 
 126     ADIOI_incorporate_system_hints(info, ADIOI_syshints, &dupinfo);
 127     ADIO_SetInfo(fd, dupinfo, &err);
 128     if (dupinfo != MPI_INFO_NULL) {
 129         *error_code = MPI_Info_free(&dupinfo);
 130         if (*error_code != MPI_SUCCESS)
 131             goto fn_exit;
 132     }
 133     ADIOI_Info_set(fd->info, "romio_filesystem_type", fd->fns->fsname);
 134 
 135     /* Instead of repeatedly allocating this buffer in collective read/write,
 136      * allocating up-front might make memory management on small platforms
 137      * (e.g. Blue Gene) more efficent */
 138 
 139     fd->io_buf = ADIOI_Malloc(fd->hints->cb_buffer_size);
 140      /* deferred open: 
 141      * we can only do this optimization if 'fd->hints->deferred_open' is set
 142      * (which means the user hinted 'no_indep_rw' and collective buffering).
 143      * Furthermore, we only do this if our collective read/write routines use
 144      * our generic function, and not an fs-specific routine (we can defer opens
 145      * only if we use our aggreagation code). */
 146     if (fd->hints->deferred_open && 
 147                     !(uses_generic_read(fd) \
 148                             && uses_generic_write(fd))) {
 149             fd->hints->deferred_open = 0;
 150     }
 151     if (ADIO_Feature(fd, ADIO_SCALABLE_OPEN))
 152             /* disable deferred open on these fs so that scalable broadcast
 153              * will always use the propper communicator */
 154             fd->hints->deferred_open = 0;
 155 
 156 
 157     /* on BlueGene, the cb_config_list is built when hints are processed. No
 158      * one else does that right now */
 159     if (fd->hints->ranklist == NULL) {
 160         build_cb_config_list(fd, orig_comm, comm, rank, procs, error_code);
 161         if (*error_code != MPI_SUCCESS) 
 162             goto fn_exit;
 163     }
 164     fd->is_open = 0;
 165     fd->my_cb_nodes_index = -2;
 166     fd->is_agg = is_aggregator(rank, fd);
 167     /* deferred open used to split the communicator to create an "aggregator
 168      * communicator", but we only used it as a way to indicate that deferred
 169      * open happened.  fd->is_open and fd->is_agg are sufficient */
 170 
 171     /* actual opens start here */
 172     /* generic open: one process opens to create the file, all others open */
 173     /* nfs open: everybody opens or else you'll end up with "file not found"
 174      * due to stupid nfs consistency semantics */
 175     /* scalable open: one process opens and broadcasts results to everyone */
 176 
 177     ADIOI_OpenColl(fd, rank, access_mode, error_code);
 178 
 179     /* deferred open consideration: if an independent process lied about
 180      * "no_indep_rw" and opens the file later (example: HDF5 uses independent
 181      * i/o for metadata), that deferred open will use the access_mode provided
 182      * by the user.  CREATE|EXCL only makes sense here -- exclusive access in
 183      * the deferred open case is going to fail and surprise the user.  Turn off
 184      * the excl amode bit. Save user's ammode for MPI_FILE_GET_AMODE */
 185     fd->orig_access_mode = access_mode;
 186     if (fd->access_mode & ADIO_EXCL) fd->access_mode ^= ADIO_EXCL;
 187 
 188 
 189     /* for debugging, it can be helpful to see the hints selected. Some file
 190      * systes set up the hints in the open call (e.g. lustre) */
 191     p = getenv("ROMIO_PRINT_HINTS");
 192     if (rank == 0 && p != NULL ) {
 193         ADIOI_Info_print_keyvals(fd->info);
 194     }
 195 
 196  fn_exit:
 197     MPI_Allreduce(error_code, &max_error_code, 1, MPI_INT, MPI_MAX, comm);
 198     if (max_error_code != MPI_SUCCESS) {
 199 
 200         /* If the file was successfully opened, close it */
 201         if (*error_code == MPI_SUCCESS) {
 202         
 203             /* in the deferred open case, only those who have actually
 204                opened the file should close it */
 205             if (fd->hints->deferred_open)  {
 206                 if (fd->is_agg) {
 207                     (*(fd->fns->ADIOI_xxx_Close))(fd, error_code);
 208                 }
 209             }
 210             else {
 211                 (*(fd->fns->ADIOI_xxx_Close))(fd, error_code);
 212             }
 213         }
 214         ADIOI_Free(fd->filename);
 215         ADIOI_Free(fd->hints->ranklist);
 216         if ( fd->hints->cb_config_list != NULL ) ADIOI_Free(fd->hints->cb_config_list);
 217         ADIOI_Free(fd->hints);
 218         if (fd->info != MPI_INFO_NULL) MPI_Info_free(&(fd->info));
 219         ADIOI_Free(fd->io_buf);
 220         ADIOI_Free(fd);
 221         fd = ADIO_FILE_NULL;
 222         if (*error_code == MPI_SUCCESS)
 223         {
 224             *error_code = MPIO_Err_create_code(MPI_SUCCESS,
 225                                                MPIR_ERR_RECOVERABLE, myname,
 226                                                __LINE__, MPI_ERR_IO,
 227                                                "**oremote_fail", 0);
 228         }
 229     }
 230 
 231     return fd;
 232 }
 233 
 234 /* a simple linear search. possible enancement: add a my_cb_nodes_index member
 235  * ( index into cb_nodes, else -1 if not aggregator ) for faster lookups 
 236  *
 237  * fd->hints->cb_nodes is the number of aggregators
 238  * fd->hints->ranklist[] is an array of the ranks of aggregators
 239  *
 240  * might want to move this to adio/common/cb_config_list.c 
 241  */
 242 int is_aggregator(int rank, ADIO_File fd ) {
 243         int i;
 244         
 245         if (fd->my_cb_nodes_index == -2) {
 246             for (i=0; i< fd->hints->cb_nodes; i++ ) {
 247                 if ( rank == fd->hints->ranklist[i] ) {
 248                     fd->my_cb_nodes_index = i;
 249                     return 1;
 250                 }
 251             }
 252             fd->my_cb_nodes_index = -1;
 253         }
 254         else if (fd->my_cb_nodes_index != -1)
 255             return 1;
 256 
 257         return 0;
 258 }
 259 
 260 /*
 261  * If file system implements some version of two-phase -- doesn't have to be
 262  * generic -- we can still carry out the defered open optimization
 263  */
 264 static int uses_generic_read(ADIO_File fd)
 265 {
 266     if (ADIO_Feature(fd, ADIO_TWO_PHASE))
 267         return 1;
 268     return 0;
 269 }
 270 
 271 static int uses_generic_write(ADIO_File fd)
 272 {
 273     if (ADIO_Feature(fd, ADIO_TWO_PHASE))
 274         return 1;
 275     return 0;
 276 }
 277 
 278 static int build_cb_config_list(ADIO_File fd, 
 279         MPI_Comm orig_comm, MPI_Comm comm, 
 280         int rank, int procs, int *error_code)
 281 {
 282     ADIO_cb_name_array array;
 283     int *tmp_ranklist;
 284     int rank_ct;
 285     char *value;
 286     static char myname[] = "ADIO_OPEN cb_config_list";
 287 
 288     /* gather the processor name array if we don't already have it */
 289     /* this has to be done early in ADIO_Open so that we can cache the name
 290      * array in both the dup'd communicator (in case we want it later) and the
 291      * original communicator */
 292     ADIOI_cb_gather_name_array(orig_comm, comm, &array);
 293 
 294 /* parse the cb_config_list and create a rank map on rank 0 */
 295     if (rank == 0) {
 296         tmp_ranklist = (int *) ADIOI_Malloc(sizeof(int) * procs);
 297         if (tmp_ranklist == NULL) {
 298             *error_code = MPIO_Err_create_code(*error_code,
 299                                                MPIR_ERR_RECOVERABLE,
 300                                                myname,
 301                                                __LINE__,
 302                                                MPI_ERR_OTHER,
 303                                                "**nomem2",0);
 304             return 0;
 305         }
 306 
 307         rank_ct = ADIOI_cb_config_list_parse(fd->hints->cb_config_list, 
 308                                              array, tmp_ranklist,
 309                                              fd->hints->cb_nodes);
 310 
 311         /* store the ranklist using the minimum amount of memory */
 312         if (rank_ct > 0) {
 313             fd->hints->ranklist = (int *) ADIOI_Malloc(sizeof(int) * rank_ct);
 314             memcpy(fd->hints->ranklist, tmp_ranklist, sizeof(int) * rank_ct);
 315         }
 316         ADIOI_Free(tmp_ranklist);
 317         fd->hints->cb_nodes = rank_ct;
 318         /* TEMPORARY -- REMOVE WHEN NO LONGER UPDATING INFO FOR FS-INDEP. */
 319         value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char));
 320         ADIOI_Snprintf(value, MPI_MAX_INFO_VAL+1, "%d", rank_ct);
 321         ADIOI_Info_set(fd->info, "cb_nodes", value);
 322         ADIOI_Free(value);
 323     }
 324 
 325     ADIOI_cb_bcast_rank_map(fd);
 326     if (fd->hints->cb_nodes <= 0) {
 327         *error_code = MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
 328                                            myname, __LINE__, MPI_ERR_IO,
 329                                            "**ioagnomatch", 0);
 330         fd = ADIO_FILE_NULL;
 331     }
 332     return 0;
 333 }
 334 
 335 /* 
 336  * vim: ts=8 sts=4 sw=4 noexpandtab 
 337  */

/* [<][>][^][v][top][bottom][index][help] */