root/opal/datatype/opal_convertor.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. opal_convertor_construct
  2. opal_convertor_destruct
  3. opal_convertor_destroy_masters
  4. opal_convertor_find_or_create_master
  5. opal_convertor_create
  6. opal_convertor_pack
  7. opal_convertor_unpack
  8. opal_convertor_create_stack_with_pos_contig
  9. opal_convertor_create_stack_at_begining
  10. opal_convertor_set_position_nocheck
  11. opal_datatype_compute_remote_size
  12. opal_convertor_compute_remote_size
  13. opal_convertor_prepare_for_recv
  14. opal_convertor_prepare_for_send
  15. opal_convertor_clone
  16. opal_convertor_dump
  17. opal_datatype_dump_stack

   1 /* -*- Mode: C; c-basic-offset:4 ; -*- */
   2 /*
   3  * Copyright (c) 2004-2006 The Trustees of Indiana University and Indiana
   4  *                         University Research and Technology
   5  *                         Corporation.  All rights reserved.
   6  * Copyright (c) 2004-2018 The University of Tennessee and The University
   7  *                         of Tennessee Research Foundation.  All rights
   8  *                         reserved.
   9  * Copyright (c) 2004-2006 High Performance Computing Center Stuttgart,
  10  *                         University of Stuttgart.  All rights reserved.
  11  * Copyright (c) 2004-2006 The Regents of the University of California.
  12  *                         All rights reserved.
  13  * Copyright (c) 2009      Oak Ridge National Labs.  All rights reserved.
  14  * Copyright (c) 2011      NVIDIA Corporation.  All rights reserved.
  15  * Copyright (c) 2013-2018 Research Organization for Information Science
  16  *                         and Technology (RIST).  All rights reserved.
  17  * Copyright (c) 2017      Intel, Inc. All rights reserved
  18  * $COPYRIGHT$
  19  *
  20  * Additional copyrights may follow
  21  *
  22  * $HEADER$
  23  */
  24 
  25 #include "opal_config.h"
  26 
  27 #include <stddef.h>
  28 #include <stdio.h>
  29 #include <stdint.h>
  30 
  31 #include "opal/prefetch.h"
  32 #include "opal/util/arch.h"
  33 #include "opal/util/output.h"
  34 
  35 #include "opal/datatype/opal_datatype_internal.h"
  36 #include "opal/datatype/opal_datatype.h"
  37 #include "opal/datatype/opal_convertor.h"
  38 #include "opal/datatype/opal_datatype_checksum.h"
  39 #include "opal/datatype/opal_datatype_prototypes.h"
  40 #include "opal/datatype/opal_convertor_internal.h"
  41 #if OPAL_CUDA_SUPPORT
  42 #include "opal/datatype/opal_datatype_cuda.h"
  43 #define MEMCPY_CUDA( DST, SRC, BLENGTH, CONVERTOR ) \
  44     CONVERTOR->cbmemcpy( (DST), (SRC), (BLENGTH), (CONVERTOR) )
  45 #endif
  46 
  47 static void opal_convertor_construct( opal_convertor_t* convertor )
  48 {
  49     convertor->pStack         = convertor->static_stack;
  50     convertor->stack_size     = DT_STATIC_STACK_SIZE;
  51     convertor->partial_length = 0;
  52     convertor->remoteArch     = opal_local_arch;
  53     convertor->flags          = OPAL_DATATYPE_FLAG_NO_GAPS | CONVERTOR_COMPLETED;
  54 #if OPAL_CUDA_SUPPORT
  55     convertor->cbmemcpy       = &opal_cuda_memcpy;
  56 #endif
  57 }
  58 
  59 
  60 static void opal_convertor_destruct( opal_convertor_t* convertor )
  61 {
  62     opal_convertor_cleanup( convertor );
  63 }
  64 
  65 OBJ_CLASS_INSTANCE(opal_convertor_t, opal_object_t, opal_convertor_construct, opal_convertor_destruct );
  66 
  67 static opal_convertor_master_t* opal_convertor_master_list = NULL;
  68 
  69 extern conversion_fct_t opal_datatype_heterogeneous_copy_functions[OPAL_DATATYPE_MAX_PREDEFINED];
  70 extern conversion_fct_t opal_datatype_copy_functions[OPAL_DATATYPE_MAX_PREDEFINED];
  71 
  72 void opal_convertor_destroy_masters( void )
  73 {
  74     opal_convertor_master_t* master = opal_convertor_master_list;
  75 
  76     while( NULL != master ) {
  77         opal_convertor_master_list = master->next;
  78         master->next = NULL;
  79         /* Cleanup the conversion function if not one of the defaults */
  80         if( (master->pFunctions != opal_datatype_heterogeneous_copy_functions) &&
  81             (master->pFunctions != opal_datatype_copy_functions) )
  82             free( master->pFunctions );
  83 
  84         free( master );
  85         master = opal_convertor_master_list;
  86     }
  87 }
  88 
  89 /**
  90  * Find or create a convertor suitable for the remote architecture. If there
  91  * is already a master convertor for this architecture then return it.
  92  * Otherwise, create and initialize a full featured master convertor.
  93  */
  94 opal_convertor_master_t* opal_convertor_find_or_create_master( uint32_t remote_arch )
  95 {
  96     opal_convertor_master_t* master = opal_convertor_master_list;
  97     int i;
  98     size_t* remote_sizes;
  99 
 100     while( NULL != master ) {
 101         if( master->remote_arch == remote_arch )
 102             return master;
 103         master = master->next;
 104     }
 105     /**
 106      * Create a new convertor matching the specified architecture and add it to the
 107      * master convertor list.
 108      */
 109     master = (opal_convertor_master_t*)malloc( sizeof(opal_convertor_master_t) );
 110     master->next = opal_convertor_master_list;
 111     opal_convertor_master_list = master;
 112     master->remote_arch = remote_arch;
 113     master->flags       = 0;
 114     master->hetero_mask = 0;
 115     /**
 116      * Most of the sizes will be identical, so for now just make a copy of
 117      * the local ones. As master->remote_sizes is defined as being an array of
 118      * consts we have to manually cast it before using it for writing purposes.
 119      */
 120     remote_sizes = (size_t*)master->remote_sizes;
 121     memcpy(remote_sizes, opal_datatype_local_sizes, sizeof(size_t) * OPAL_DATATYPE_MAX_PREDEFINED);
 122     /**
 123      * If the local and remote architecture are the same there is no need
 124      * to check for the remote data sizes. They will always be the same as
 125      * the local ones.
 126      */
 127     if( master->remote_arch == opal_local_arch ) {
 128         master->pFunctions = opal_datatype_copy_functions;
 129         master->flags |= CONVERTOR_HOMOGENEOUS;
 130         return master;
 131     }
 132 
 133     /* Find out the remote bool size */
 134     if( opal_arch_checkmask( &master->remote_arch, OPAL_ARCH_BOOLIS8 ) ) {
 135         remote_sizes[OPAL_DATATYPE_BOOL] = 1;
 136     } else if( opal_arch_checkmask( &master->remote_arch, OPAL_ARCH_BOOLIS16 ) ) {
 137         remote_sizes[OPAL_DATATYPE_BOOL] = 2;
 138     } else if( opal_arch_checkmask( &master->remote_arch, OPAL_ARCH_BOOLIS32 ) ) {
 139         remote_sizes[OPAL_DATATYPE_BOOL] = 4;
 140     } else {
 141         opal_output( 0, "Unknown sizeof(bool) for the remote architecture\n" );
 142     }
 143 
 144     /**
 145      * Now we can compute the conversion mask. For all sizes where the remote
 146      * and local architecture differ a conversion is needed. Moreover, if the
 147      * 2 architectures don't have the same endianess all data with a length
 148      * over 2 bytes (with the exception of logicals) have to be byte-swapped.
 149      */
 150     for( i = OPAL_DATATYPE_FIRST_TYPE; i < OPAL_DATATYPE_MAX_PREDEFINED; i++ ) {
 151         if( remote_sizes[i] != opal_datatype_local_sizes[i] )
 152             master->hetero_mask |= (((uint32_t)1) << i);
 153     }
 154     if( opal_arch_checkmask( &master->remote_arch, OPAL_ARCH_ISBIGENDIAN ) !=
 155         opal_arch_checkmask( &opal_local_arch, OPAL_ARCH_ISBIGENDIAN ) ) {
 156         uint32_t hetero_mask = 0;
 157 
 158         for( i = OPAL_DATATYPE_FIRST_TYPE; i < OPAL_DATATYPE_MAX_PREDEFINED; i++ ) {
 159             if( remote_sizes[i] > 1 )
 160                 hetero_mask |= (((uint32_t)1) << i);
 161         }
 162         hetero_mask &= ~(((uint32_t)1) << OPAL_DATATYPE_BOOL);
 163         master->hetero_mask |= hetero_mask;
 164     }
 165     master->pFunctions = (conversion_fct_t*)malloc( sizeof(opal_datatype_heterogeneous_copy_functions) );
 166     /**
 167      * Usually the heterogeneous functions are slower than the copy ones. Let's
 168      * try to minimize the usage of the heterogeneous versions.
 169      */
 170     for( i = OPAL_DATATYPE_FIRST_TYPE; i < OPAL_DATATYPE_MAX_PREDEFINED; i++ ) {
 171         if( master->hetero_mask & (((uint32_t)1) << i) )
 172             master->pFunctions[i] = opal_datatype_heterogeneous_copy_functions[i];
 173         else
 174             master->pFunctions[i] = opal_datatype_copy_functions[i];
 175     }
 176 
 177     /* We're done so far, return the mater convertor */
 178     return master;
 179 }
 180 
 181 
 182 opal_convertor_t* opal_convertor_create( int32_t remote_arch, int32_t mode )
 183 {
 184     opal_convertor_t* convertor = OBJ_NEW(opal_convertor_t);
 185     opal_convertor_master_t* master;
 186 
 187     master = opal_convertor_find_or_create_master( remote_arch );
 188 
 189     convertor->remoteArch = remote_arch;
 190     convertor->stack_pos  = 0;
 191     convertor->flags      = master->flags;
 192     convertor->master     = master;
 193 
 194     return convertor;
 195 }
 196 
 197 #define OPAL_CONVERTOR_SET_STATUS_BEFORE_PACK_UNPACK( CONVERTOR, IOV, OUT, MAX_DATA ) \
 198     do {                                                                \
 199         /* protect against over packing data */                         \
 200         if( OPAL_UNLIKELY((CONVERTOR)->flags & CONVERTOR_COMPLETED) ) { \
 201             (IOV)[0].iov_len = 0;                                       \
 202             *(OUT) = 0;                                                 \
 203             *(MAX_DATA) = 0;                                            \
 204             return 1;  /* nothing to do */                              \
 205         }                                                               \
 206         (CONVERTOR)->checksum = OPAL_CSUM_ZERO;                         \
 207         (CONVERTOR)->csum_ui1 = 0;                                      \
 208         (CONVERTOR)->csum_ui2 = 0;                                      \
 209         assert( (CONVERTOR)->bConverted < (CONVERTOR)->local_size );    \
 210     } while(0)
 211 
 212 /**
 213  * Return 0 if everything went OK and if there is still room before the complete
 214  *          conversion of the data (need additional call with others input buffers )
 215  *        1 if everything went fine and the data was completly converted
 216  *       -1 something wrong occurs.
 217  */
 218 int32_t opal_convertor_pack( opal_convertor_t* pConv,
 219                              struct iovec* iov, uint32_t* out_size,
 220                              size_t* max_data )
 221 {
 222     OPAL_CONVERTOR_SET_STATUS_BEFORE_PACK_UNPACK( pConv, iov, out_size, max_data );
 223 
 224     if( OPAL_LIKELY(pConv->flags & CONVERTOR_NO_OP) ) {
 225         /**
 226          * We are doing conversion on a contiguous datatype on a homogeneous
 227          * environment. The convertor contain minimal information, we only
 228          * use the bConverted to manage the conversion.
 229          */
 230         uint32_t i;
 231         unsigned char* base_pointer;
 232         size_t pending_length = pConv->local_size - pConv->bConverted;
 233 
 234         *max_data = pending_length;
 235         opal_convertor_get_current_pointer( pConv, (void**)&base_pointer );
 236 
 237         for( i = 0; i < *out_size; i++ ) {
 238             if( iov[i].iov_len >= pending_length ) {
 239                 goto complete_contiguous_data_pack;
 240             }
 241             if( OPAL_LIKELY(NULL == iov[i].iov_base) )
 242                 iov[i].iov_base = (IOVBASE_TYPE *) base_pointer;
 243             else
 244 #if OPAL_CUDA_SUPPORT
 245                 MEMCPY_CUDA( iov[i].iov_base, base_pointer, iov[i].iov_len, pConv );
 246 #else
 247                 MEMCPY( iov[i].iov_base, base_pointer, iov[i].iov_len );
 248 #endif
 249             pending_length -= iov[i].iov_len;
 250             base_pointer += iov[i].iov_len;
 251         }
 252         *max_data -= pending_length;
 253         pConv->bConverted += (*max_data);
 254         return 0;
 255 
 256 complete_contiguous_data_pack:
 257         iov[i].iov_len = pending_length;
 258         if( OPAL_LIKELY(NULL == iov[i].iov_base) )
 259             iov[i].iov_base = (IOVBASE_TYPE *) base_pointer;
 260         else
 261 #if OPAL_CUDA_SUPPORT
 262             MEMCPY_CUDA( iov[i].iov_base, base_pointer, iov[i].iov_len, pConv );
 263 #else
 264             MEMCPY( iov[i].iov_base, base_pointer, iov[i].iov_len );
 265 #endif
 266         pConv->bConverted = pConv->local_size;
 267         *out_size = i + 1;
 268         pConv->flags |= CONVERTOR_COMPLETED;
 269         return 1;
 270     }
 271 
 272     return pConv->fAdvance( pConv, iov, out_size, max_data );
 273 }
 274 
 275 
 276 int32_t opal_convertor_unpack( opal_convertor_t* pConv,
 277                                struct iovec* iov, uint32_t* out_size,
 278                                size_t* max_data )
 279 {
 280     OPAL_CONVERTOR_SET_STATUS_BEFORE_PACK_UNPACK( pConv, iov, out_size, max_data );
 281 
 282     if( OPAL_LIKELY(pConv->flags & CONVERTOR_NO_OP) ) {
 283         /**
 284          * We are doing conversion on a contiguous datatype on a homogeneous
 285          * environment. The convertor contain minimal informations, we only
 286          * use the bConverted to manage the conversion.
 287          */
 288         uint32_t i;
 289         unsigned char* base_pointer;
 290         size_t pending_length = pConv->local_size - pConv->bConverted;
 291 
 292         *max_data = pending_length;
 293         opal_convertor_get_current_pointer( pConv, (void**)&base_pointer );
 294 
 295         for( i = 0; i < *out_size; i++ ) {
 296             if( iov[i].iov_len >= pending_length ) {
 297                 goto complete_contiguous_data_unpack;
 298             }
 299 #if OPAL_CUDA_SUPPORT
 300             MEMCPY_CUDA( base_pointer, iov[i].iov_base, iov[i].iov_len, pConv );
 301 #else
 302             MEMCPY( base_pointer, iov[i].iov_base, iov[i].iov_len );
 303 #endif
 304             pending_length -= iov[i].iov_len;
 305             base_pointer += iov[i].iov_len;
 306         }
 307         *max_data -= pending_length;
 308         pConv->bConverted += (*max_data);
 309         return 0;
 310 
 311 complete_contiguous_data_unpack:
 312         iov[i].iov_len = pending_length;
 313 #if OPAL_CUDA_SUPPORT
 314         MEMCPY_CUDA( base_pointer, iov[i].iov_base, iov[i].iov_len, pConv );
 315 #else
 316         MEMCPY( base_pointer, iov[i].iov_base, iov[i].iov_len );
 317 #endif
 318         pConv->bConverted = pConv->local_size;
 319         *out_size = i + 1;
 320         pConv->flags |= CONVERTOR_COMPLETED;
 321         return 1;
 322     }
 323 
 324     return pConv->fAdvance( pConv, iov, out_size, max_data );
 325 }
 326 
 327 static inline int opal_convertor_create_stack_with_pos_contig( opal_convertor_t* pConvertor,
 328                                                                size_t starting_point, const size_t* sizes )
 329 {
 330     dt_stack_t* pStack;   /* pointer to the position on the stack */
 331     const opal_datatype_t* pData = pConvertor->pDesc;
 332     dt_elem_desc_t* pElems;
 333     size_t count;
 334     ptrdiff_t extent;
 335 
 336     pStack = pConvertor->pStack;
 337     /**
 338      * The prepare function already make the selection on which data representation
 339      * we have to use: normal one or the optimized version ?
 340      */
 341     pElems = pConvertor->use_desc->desc;
 342 
 343     count = starting_point / pData->size;
 344     extent = pData->ub - pData->lb;
 345 
 346     pStack[0].type     = OPAL_DATATYPE_LOOP;  /* the first one is always the loop */
 347     pStack[0].count    = pConvertor->count - count;
 348     pStack[0].index    = -1;
 349     pStack[0].disp     = count * extent;
 350 
 351     /* now compute the number of pending bytes */
 352     count = starting_point - count * pData->size;
 353     /**
 354      * We save the current displacement starting from the begining
 355      * of this data.
 356      */
 357     if( OPAL_LIKELY(0 == count) ) {
 358         pStack[1].type     = pElems->elem.common.type;
 359         pStack[1].count    = pElems->elem.count;
 360     } else {
 361         pStack[1].type  = OPAL_DATATYPE_UINT1;
 362         pStack[1].count = pData->size - count;
 363     }
 364     pStack[1].disp  = count;
 365     pStack[1].index = 0;  /* useless */
 366 
 367     pConvertor->bConverted = starting_point;
 368     pConvertor->stack_pos = 1;
 369     assert( 0 == pConvertor->partial_length );
 370     return OPAL_SUCCESS;
 371 }
 372 
 373 static inline
 374 int opal_convertor_create_stack_at_begining( opal_convertor_t* convertor,
 375                                              const size_t* sizes )
 376 {
 377     dt_stack_t* pStack = convertor->pStack;
 378     dt_elem_desc_t* pElems;
 379 
 380     /**
 381      * The prepare function already make the selection on which data representation
 382      * we have to use: normal one or the optimized version ?
 383      */
 384     pElems = convertor->use_desc->desc;
 385 
 386     convertor->stack_pos      = 1;
 387     convertor->partial_length = 0;
 388     convertor->bConverted     = 0;
 389     /**
 390      * Fill the first position on the stack. This one correspond to the
 391      * last fake OPAL_DATATYPE_END_LOOP that we add to the data representation and
 392      * allow us to move quickly inside the datatype when we have a count.
 393      */
 394     pStack[0].index = -1;
 395     pStack[0].count = convertor->count;
 396     pStack[0].disp  = 0;
 397     pStack[0].type  = OPAL_DATATYPE_LOOP;
 398 
 399     pStack[1].index = 0;
 400     pStack[1].disp = 0;
 401     if( pElems[0].elem.common.type == OPAL_DATATYPE_LOOP ) {
 402         pStack[1].count = pElems[0].loop.loops;
 403         pStack[1].type  = OPAL_DATATYPE_LOOP;
 404     } else {
 405         pStack[1].count = pElems[0].elem.count;
 406         pStack[1].type  = pElems[0].elem.common.type;
 407     }
 408     return OPAL_SUCCESS;
 409 }
 410 
 411 
 412 int32_t opal_convertor_set_position_nocheck( opal_convertor_t* convertor,
 413                                              size_t* position )
 414 {
 415     int32_t rc;
 416 
 417     /**
 418      * create_stack_with_pos_contig always set the position relative to the ZERO
 419      * position, so there is no need for special handling. In all other cases,
 420      * if we plan to rollback the convertor then first we have to reset it at
 421      * the beginning.
 422      */
 423     if( OPAL_LIKELY(convertor->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS) ) {
 424         rc = opal_convertor_create_stack_with_pos_contig( convertor, (*position),
 425                                                           opal_datatype_local_sizes );
 426     } else {
 427         if( (0 == (*position)) || ((*position) < convertor->bConverted) ) {
 428             rc = opal_convertor_create_stack_at_begining( convertor, opal_datatype_local_sizes );
 429             if( 0 == (*position) ) return rc;
 430         }
 431         rc = opal_convertor_generic_simple_position( convertor, position );
 432         /**
 433          * If we have a non-contigous send convertor don't allow it move in the middle
 434          * of a predefined datatype, it won't be able to copy out the left-overs
 435          * anyway. Instead force the position to stay on predefined datatypes
 436          * boundaries. As we allow partial predefined datatypes on the contiguous
 437          * case, we should be accepted by any receiver convertor.
 438          */
 439         if( CONVERTOR_SEND & convertor->flags ) {
 440             convertor->bConverted -= convertor->partial_length;
 441             convertor->partial_length = 0;
 442         }
 443     }
 444     *position = convertor->bConverted;
 445     return rc;
 446 }
 447 
 448 static size_t
 449 opal_datatype_compute_remote_size( const opal_datatype_t* pData,
 450                                    const size_t* sizes )
 451 {
 452     uint32_t typeMask = pData->bdt_used;
 453     size_t length = 0;
 454 
 455     if (opal_datatype_is_predefined(pData)) {
 456         return sizes[pData->desc.desc->elem.common.type];
 457     }
 458 
 459     if( OPAL_UNLIKELY(NULL == pData->ptypes) ) {
 460         /* Allocate and fill the array of types used in the datatype description */
 461         opal_datatype_compute_ptypes( (opal_datatype_t*)pData );
 462     }
 463 
 464     for( int i = OPAL_DATATYPE_FIRST_TYPE; typeMask && (i < OPAL_DATATYPE_MAX_PREDEFINED); i++ ) {
 465         if( typeMask & ((uint32_t)1 << i) ) {
 466             length += (pData->ptypes[i] * sizes[i]);
 467             typeMask ^= ((uint32_t)1 << i);
 468         }
 469     }
 470     return length;
 471 }
 472 
 473 /**
 474  * Compute the remote size. If necessary remove the homogeneous flag
 475  * and redirect the convertor description toward the non-optimized
 476  * datatype representation.
 477  */
 478 size_t opal_convertor_compute_remote_size( opal_convertor_t* pConvertor )
 479 {
 480     opal_datatype_t* datatype = (opal_datatype_t*)pConvertor->pDesc;
 481     
 482     pConvertor->remote_size = pConvertor->local_size;
 483     if( OPAL_UNLIKELY(datatype->bdt_used & pConvertor->master->hetero_mask) ) {
 484         pConvertor->flags &= (~CONVERTOR_HOMOGENEOUS);
 485         if (!(pConvertor->flags & CONVERTOR_SEND && pConvertor->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS)) {
 486             pConvertor->use_desc = &(datatype->desc);
 487         }
 488         if( 0 == (pConvertor->flags & CONVERTOR_HAS_REMOTE_SIZE) ) {
 489             /* This is for a single datatype, we must update it with the count */
 490             pConvertor->remote_size = opal_datatype_compute_remote_size(datatype,
 491                                                                         pConvertor->master->remote_sizes);
 492             pConvertor->remote_size *= pConvertor->count;
 493         }
 494     }
 495     pConvertor->flags |= CONVERTOR_HAS_REMOTE_SIZE;
 496     return pConvertor->remote_size;
 497 }
 498 
 499 /**
 500  * This macro will initialize a convertor based on a previously created
 501  * convertor. The idea is the move outside these function the heavy
 502  * selection of architecture features for the convertors. I consider
 503  * here that the convertor is clean, either never initialized or already
 504  * cleaned.
 505  */
 506 #define OPAL_CONVERTOR_PREPARE( convertor, datatype, count, pUserBuf )  \
 507     {                                                                   \
 508         convertor->local_size = count * datatype->size;                 \
 509         convertor->pBaseBuf   = (unsigned char*)pUserBuf;               \
 510         convertor->count      = count;                                  \
 511         convertor->pDesc      = (opal_datatype_t*)datatype;             \
 512         convertor->bConverted = 0;                                      \
 513         convertor->use_desc   = &(datatype->opt_desc);                  \
 514         /* If the data is empty we just mark the convertor as           \
 515          * completed. With this flag set the pack and unpack functions  \
 516          * will not do anything.                                        \
 517          */                                                             \
 518         if( OPAL_UNLIKELY((0 == count) || (0 == datatype->size)) ) {    \
 519             convertor->flags |= (OPAL_DATATYPE_FLAG_NO_GAPS | CONVERTOR_COMPLETED | CONVERTOR_HAS_REMOTE_SIZE); \
 520             convertor->local_size = convertor->remote_size = 0;         \
 521             return OPAL_SUCCESS;                                        \
 522         }                                                               \
 523                                                                         \
 524         /* Grab the datatype part of the flags */                       \
 525         convertor->flags     &= CONVERTOR_TYPE_MASK;                    \
 526         convertor->flags     |= (CONVERTOR_DATATYPE_MASK & datatype->flags); \
 527         convertor->flags     |= (CONVERTOR_NO_OP | CONVERTOR_HOMOGENEOUS); \
 528                                                                         \
 529         convertor->remote_size = convertor->local_size;                 \
 530         if( OPAL_LIKELY(convertor->remoteArch == opal_local_arch) ) {   \
 531             if( !(convertor->flags & CONVERTOR_WITH_CHECKSUM) &&        \
 532                 ((convertor->flags & OPAL_DATATYPE_FLAG_NO_GAPS) || \
 533                  ((convertor->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS) && (1 == count))) ) { \
 534                 return OPAL_SUCCESS;                                    \
 535             }                                                           \
 536         }                                                               \
 537                                                                         \
 538         assert( (convertor)->pDesc == (datatype) );                     \
 539         opal_convertor_compute_remote_size( convertor );                \
 540         assert( NULL != convertor->use_desc->desc );                    \
 541         /* For predefined datatypes (contiguous) do nothing more */     \
 542         /* if checksum is enabled then always continue */               \
 543         if( ((convertor->flags & (CONVERTOR_WITH_CHECKSUM | OPAL_DATATYPE_FLAG_NO_GAPS)) \
 544              == OPAL_DATATYPE_FLAG_NO_GAPS) &&                          \
 545             ((convertor->flags & (CONVERTOR_SEND | CONVERTOR_HOMOGENEOUS)) == \
 546              (CONVERTOR_SEND | CONVERTOR_HOMOGENEOUS)) ) {              \
 547             return OPAL_SUCCESS;                                        \
 548         }                                                               \
 549         convertor->flags &= ~CONVERTOR_NO_OP;                           \
 550         {                                                               \
 551             uint32_t required_stack_length = datatype->loops + 1;       \
 552                                                                         \
 553             if( required_stack_length > convertor->stack_size ) {       \
 554                 assert(convertor->pStack == convertor->static_stack);   \
 555                 convertor->stack_size = required_stack_length;          \
 556                 convertor->pStack     = (dt_stack_t*)malloc(sizeof(dt_stack_t) * \
 557                                                             convertor->stack_size ); \
 558             }                                                           \
 559         }                                                               \
 560         opal_convertor_create_stack_at_begining( convertor, opal_datatype_local_sizes ); \
 561     }
 562 
 563 
 564 int32_t opal_convertor_prepare_for_recv( opal_convertor_t* convertor,
 565                                          const struct opal_datatype_t* datatype,
 566                                          size_t count,
 567                                          const void* pUserBuf )
 568 {
 569     /* Here I should check that the data is not overlapping */
 570 
 571     convertor->flags |= CONVERTOR_RECV;
 572 #if OPAL_CUDA_SUPPORT
 573     if (!( convertor->flags & CONVERTOR_SKIP_CUDA_INIT )) {
 574         mca_cuda_convertor_init(convertor, pUserBuf);
 575     }
 576 #endif
 577 
 578     assert(! (convertor->flags & CONVERTOR_SEND));
 579     OPAL_CONVERTOR_PREPARE( convertor, datatype, count, pUserBuf );
 580 
 581     if( convertor->flags & CONVERTOR_WITH_CHECKSUM ) {
 582         if( !(convertor->flags & CONVERTOR_HOMOGENEOUS) ) {
 583             convertor->fAdvance = opal_unpack_general_checksum;
 584         } else {
 585             if( convertor->pDesc->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) {
 586                 convertor->fAdvance = opal_unpack_homogeneous_contig_checksum;
 587             } else {
 588                 convertor->fAdvance = opal_generic_simple_unpack_checksum;
 589             }
 590         }
 591     } else {
 592         if( !(convertor->flags & CONVERTOR_HOMOGENEOUS) ) {
 593             convertor->fAdvance = opal_unpack_general;
 594         } else {
 595             if( convertor->pDesc->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) {
 596                 convertor->fAdvance = opal_unpack_homogeneous_contig;
 597             } else {
 598                 convertor->fAdvance = opal_generic_simple_unpack;
 599             }
 600         }
 601     }
 602     return OPAL_SUCCESS;
 603 }
 604 
 605 
 606 int32_t opal_convertor_prepare_for_send( opal_convertor_t* convertor,
 607                                          const struct opal_datatype_t* datatype,
 608                                          size_t count,
 609                                          const void* pUserBuf )
 610 {
 611     convertor->flags |= CONVERTOR_SEND;
 612 #if OPAL_CUDA_SUPPORT
 613     if (!( convertor->flags & CONVERTOR_SKIP_CUDA_INIT )) {
 614         mca_cuda_convertor_init(convertor, pUserBuf);
 615     }
 616 #endif
 617 
 618     OPAL_CONVERTOR_PREPARE( convertor, datatype, count, pUserBuf );
 619 
 620     if( convertor->flags & CONVERTOR_WITH_CHECKSUM ) {
 621         if( CONVERTOR_SEND_CONVERSION == (convertor->flags & (CONVERTOR_SEND_CONVERSION|CONVERTOR_HOMOGENEOUS)) ) {
 622             convertor->fAdvance = opal_pack_general_checksum;
 623         } else {
 624             if( datatype->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) {
 625                 if( ((datatype->ub - datatype->lb) == (ptrdiff_t)datatype->size)
 626                     || (1 >= convertor->count) )
 627                     convertor->fAdvance = opal_pack_homogeneous_contig_checksum;
 628                 else
 629                     convertor->fAdvance = opal_pack_homogeneous_contig_with_gaps_checksum;
 630             } else {
 631                 convertor->fAdvance = opal_generic_simple_pack_checksum;
 632             }
 633         }
 634     } else {
 635         if( CONVERTOR_SEND_CONVERSION == (convertor->flags & (CONVERTOR_SEND_CONVERSION|CONVERTOR_HOMOGENEOUS)) ) {
 636             convertor->fAdvance = opal_pack_general;
 637         } else {
 638             if( datatype->flags & OPAL_DATATYPE_FLAG_CONTIGUOUS ) {
 639                 if( ((datatype->ub - datatype->lb) == (ptrdiff_t)datatype->size)
 640                     || (1 >= convertor->count) )
 641                     convertor->fAdvance = opal_pack_homogeneous_contig;
 642                 else
 643                     convertor->fAdvance = opal_pack_homogeneous_contig_with_gaps;
 644             } else {
 645                 convertor->fAdvance = opal_generic_simple_pack;
 646             }
 647         }
 648     }
 649     return OPAL_SUCCESS;
 650 }
 651 
 652 /*
 653  * These functions can be used in order to create an IDENTICAL copy of one convertor. In this
 654  * context IDENTICAL means that the datatype and count and all other properties of the basic
 655  * convertor get replicated on this new convertor. However, the references to the datatype
 656  * are not increased. This function take special care about the stack. If all the cases the
 657  * stack is created with the correct number of entries but if the copy_stack is true (!= 0)
 658  * then the content of the old stack is copied on the new one. The result will be a convertor
 659  * ready to use starting from the old position. If copy_stack is false then the convertor
 660  * is created with a empty stack (you have to use opal_convertor_set_position before using it).
 661  */
 662 int opal_convertor_clone( const opal_convertor_t* source,
 663                           opal_convertor_t* destination,
 664                           int32_t copy_stack )
 665 {
 666     destination->remoteArch        = source->remoteArch;
 667     destination->flags             = source->flags;
 668     destination->pDesc             = source->pDesc;
 669     destination->use_desc          = source->use_desc;
 670     destination->count             = source->count;
 671     destination->pBaseBuf          = source->pBaseBuf;
 672     destination->fAdvance          = source->fAdvance;
 673     destination->master            = source->master;
 674     destination->local_size        = source->local_size;
 675     destination->remote_size       = source->remote_size;
 676     /* create the stack */
 677     if( OPAL_UNLIKELY(source->stack_size > DT_STATIC_STACK_SIZE) ) {
 678         destination->pStack = (dt_stack_t*)malloc(sizeof(dt_stack_t) * source->stack_size );
 679     } else {
 680         destination->pStack = destination->static_stack;
 681     }
 682     destination->stack_size = source->stack_size;
 683 
 684     /* initialize the stack */
 685     if( OPAL_LIKELY(0 == copy_stack) ) {
 686         destination->bConverted = -1;
 687         destination->stack_pos  = -1;
 688     } else {
 689         memcpy( destination->pStack, source->pStack, sizeof(dt_stack_t) * (source->stack_pos+1) );
 690         destination->bConverted = source->bConverted;
 691         destination->stack_pos  = source->stack_pos;
 692     }
 693 #if OPAL_CUDA_SUPPORT
 694     destination->cbmemcpy   = source->cbmemcpy;
 695 #endif
 696     return OPAL_SUCCESS;
 697 }
 698 
 699 
 700 void opal_convertor_dump( opal_convertor_t* convertor )
 701 {
 702     opal_output( 0, "Convertor %p count %" PRIsize_t " stack position %u bConverted %" PRIsize_t "\n"
 703                  "\tlocal_size %" PRIsize_t " remote_size %" PRIsize_t " flags %X stack_size %u pending_length %" PRIsize_t "\n"
 704                  "\tremote_arch %u local_arch %u\n",
 705                  (void*)convertor,
 706                  convertor->count, convertor->stack_pos, convertor->bConverted,
 707                  convertor->local_size, convertor->remote_size,
 708                  convertor->flags, convertor->stack_size, convertor->partial_length,
 709                  convertor->remoteArch, opal_local_arch );
 710     if( convertor->flags & CONVERTOR_RECV ) opal_output( 0, "unpack ");
 711     if( convertor->flags & CONVERTOR_SEND ) opal_output( 0, "pack ");
 712     if( convertor->flags & CONVERTOR_SEND_CONVERSION ) opal_output( 0, "conversion ");
 713     if( convertor->flags & CONVERTOR_HOMOGENEOUS ) opal_output( 0, "homogeneous " );
 714     else opal_output( 0, "heterogeneous ");
 715     if( convertor->flags & CONVERTOR_NO_OP ) opal_output( 0, "no_op ");
 716     if( convertor->flags & CONVERTOR_WITH_CHECKSUM ) opal_output( 0, "checksum ");
 717     if( convertor->flags & CONVERTOR_CUDA ) opal_output( 0, "CUDA ");
 718     if( convertor->flags & CONVERTOR_CUDA_ASYNC ) opal_output( 0, "CUDA Async ");
 719     if( convertor->flags & CONVERTOR_COMPLETED ) opal_output( 0, "COMPLETED ");
 720 
 721     opal_datatype_dump( convertor->pDesc );
 722     if( !((0 == convertor->stack_pos) &&
 723           ((size_t)convertor->pStack[convertor->stack_pos].index > convertor->pDesc->desc.length)) ) {
 724         /* only if the convertor is completely initialized */
 725         opal_output( 0, "Actual stack representation\n" );
 726         opal_datatype_dump_stack( convertor->pStack, convertor->stack_pos,
 727                                   convertor->pDesc->desc.desc, convertor->pDesc->name );
 728     }
 729 }
 730 
 731 
 732 void opal_datatype_dump_stack( const dt_stack_t* pStack, int stack_pos,
 733                                const union dt_elem_desc* pDesc, const char* name )
 734 {
 735     opal_output( 0, "\nStack %p stack_pos %d name %s\n", (void*)pStack, stack_pos, name );
 736     for( ; stack_pos >= 0; stack_pos-- ) {
 737         opal_output( 0, "%d: pos %d count %" PRIsize_t " disp %ld ", stack_pos, pStack[stack_pos].index,
 738                      pStack[stack_pos].count, pStack[stack_pos].disp );
 739         if( pStack->index != -1 )
 740             opal_output( 0, "\t[desc count %lu disp %ld extent %ld]\n",
 741                          (unsigned long)pDesc[pStack[stack_pos].index].elem.count,
 742                          (long)pDesc[pStack[stack_pos].index].elem.disp,
 743                          (long)pDesc[pStack[stack_pos].index].elem.extent );
 744         else
 745             opal_output( 0, "\n" );
 746     }
 747     opal_output( 0, "\n" );
 748 }

/* [<][>][^][v][top][bottom][index][help] */