#ifndef __conf_h__
#include "conf.h"
#endif

#ifndef __dist_h__
#define __dist_h__

/* for block distribution */

/*
 * procid_block(i, NUM_DATA)
 *
 * Divides index i by the chunk length ceil(NUM_DATA / NUM_PROC), i.e. yields
 * the number of the contiguous chunk that contains i.
 * NUM_PROC is not defined in this header (presumably supplied by conf.h).
 */
#define procid_block(i, NUM_DATA) \
    ((i) /                        \
     (((NUM_DATA) + NUM_PROC - 1) / NUM_PROC))

/*
 * toglobal_block(cid, i, NUM_DATA)
 *
 * Adds offset i to the start of chunk cid, where every chunk is
 * ceil(NUM_DATA / NUM_PROC) elements long.
 */
#define toglobal_block(cid, i, NUM_DATA) \
    ((i) + ((NUM_DATA) + NUM_PROC - 1) / NUM_PROC * (cid))


/* for cyclic distribution */

/*
 * procid_cyclic(i, NUM_DATA)
 *
 * Yields i modulo NUM_PROC.  NUM_DATA is not used in the expansion.
 */
#define procid_cyclic(i, NUM_DATA) ((i) % NUM_PROC)

/*
 * num_in_proc_cyclic(cid, NUM_DATA)
 *
 * Yields NUM_DATA / NUM_PROC, plus one when cid is below the remainder
 * NUM_DATA % NUM_PROC.  NUM_DATA is evaluated more than once.
 */
#define num_in_proc_cyclic(cid, NUM_DATA)  \
    ((cid) >= (NUM_DATA) % NUM_PROC        \
         ? (NUM_DATA) / NUM_PROC           \
         : (NUM_DATA) / NUM_PROC + 1)

/*
 * toglobal_cyclic(cid, i, NUM_DATA)
 *
 * Yields i * NUM_PROC + cid.  NUM_DATA is not used in the expansion.
 */
#define toglobal_cyclic(cid, i, NUM_DATA) ((cid) + (i) * NUM_PROC)

/*
 * tolocal_cyclic(cid, i, NUM_DATA)
 *
 * Yields i / NUM_PROC.  Neither cid nor NUM_DATA is used in the expansion.
 */
#define tolocal_cyclic(cid, i, NUM_DATA) ((i) / NUM_PROC)



/* for dot distribution */

/*
 * procid_cyclic_x(i, NUM_DATA)
 *
 * Yields i modulo NUM_PROC_X.  NUM_DATA is not used in the expansion.
 */
#define procid_cyclic_x(i, NUM_DATA) ((i) % NUM_PROC_X)

/*
 * num_in_proc_cyclic_x(cid, NUM_DATA)
 *
 * Yields NUM_DATA / NUM_PROC_X, plus one when cid is below the remainder
 * NUM_DATA % NUM_PROC_X.  NUM_DATA is evaluated more than once.
 */
#define num_in_proc_cyclic_x(cid, NUM_DATA) \
    ((cid) >= (NUM_DATA) % NUM_PROC_X       \
         ? (NUM_DATA) / NUM_PROC_X          \
         : (NUM_DATA) / NUM_PROC_X + 1)

/*
 * toglobal_cyclic_x(cid, i, NUM_DATA)
 *
 * Yields i * NUM_PROC_X + cid.  NUM_DATA is not used in the expansion.
 */
#define toglobal_cyclic_x(cid, i, NUM_DATA) ((cid) + (i) * NUM_PROC_X)

/*
 * tolocal_cyclic_x(cid, i, NUM_DATA)
 *
 * Yields i / NUM_PROC_X.  Neither cid nor NUM_DATA is used in the expansion.
 */
#define tolocal_cyclic_x(cid, i, NUM_DATA) ((i) / NUM_PROC_X)

/*
 * procid_cyclic_y(i, NUM_DATA)
 *
 * Yields i modulo NUM_PROC_Y.  NUM_DATA is not used in the expansion.
 */
#define procid_cyclic_y(i, NUM_DATA) ((i) % NUM_PROC_Y)

/*
 * num_in_proc_cyclic_y(cid, NUM_DATA)
 *
 * Yields NUM_DATA / NUM_PROC_Y, plus one when cid is below the remainder
 * NUM_DATA % NUM_PROC_Y.  NUM_DATA is evaluated more than once.
 */
#define num_in_proc_cyclic_y(cid, NUM_DATA) \
    ((cid) >= (NUM_DATA) % NUM_PROC_Y       \
         ? (NUM_DATA) / NUM_PROC_Y          \
         : (NUM_DATA) / NUM_PROC_Y + 1)

/*
 * toglobal_cyclic_y(cid, i, NUM_DATA)
 *
 * Yields i * NUM_PROC_Y + cid.  NUM_DATA is not used in the expansion.
 */
#define toglobal_cyclic_y(cid, i, NUM_DATA) ((cid) + (i) * NUM_PROC_Y)

/*
 * tolocal_cyclic_y(cid, i, NUM_DATA)
 *
 * Yields i / NUM_PROC_Y.  Neither cid nor NUM_DATA is used in the expansion.
 */
#define tolocal_cyclic_y(cid, i, NUM_DATA) ((i) / NUM_PROC_Y)


/*
  for dot(8, 1) distribution

  The optimized version works correctly when both NUM_PROC_X and
  BLOCK_SIZE are powers of 2.
 */

/*
 * Constants used by the *_dot_x macros.
 *
 *   BLOCK_SIZE       length of one block
 *   BLOCK_MASK       low bits holding the offset inside a block
 *   PROC_BLOCK_MASK  low bits holding the offset inside one run of
 *                    NUM_PROC_X consecutive blocks
 */
#define BLOCK_SIZE	8
#define BLOCK_MASK	(BLOCK_SIZE - 1)
#define PROC_BLOCK_MASK	(NUM_PROC_X * BLOCK_SIZE - 1)

/*
 * OPTIMIZE_VERSION selects the mask-based macro variants.  Those only agree
 * with the division/modulo variants when BLOCK_SIZE and NUM_PROC_X are powers
 * of 2, so the switch is turned on only in that case; any other configuration
 * falls back to the generic variants instead of computing wrong indices.
 * (An undefined NUM_PROC_X evaluates to 0 here and keeps the switch on.)
 *
 * NOTE(review): the *_dot_y mask variants use MASK_Y and presumably need
 * NUM_PROC_Y to be a power of 2 as well; that is not checked here -- confirm
 * against conf.h.
 */
#if ((BLOCK_SIZE) & ((BLOCK_SIZE) - 1)) == 0 \
    && ((NUM_PROC_X) & ((NUM_PROC_X) - 1)) == 0
#define OPTIMIZE_VERSION
#endif

/*
 * procid_dot_x(i, NUM_DATA)
 *
 * Takes the block number i / BLOCK_SIZE and reduces it modulo NUM_PROC_X.
 * The optimized form replaces the modulo with a bitwise AND against MASK_X,
 * which is not defined in this header (presumably NUM_PROC_X - 1 from
 * conf.h -- confirm).  NUM_DATA is not used in the expansion.
 */
#ifdef OPTIMIZE_VERSION
#define procid_dot_x(i, NUM_DATA) ((i) / BLOCK_SIZE & MASK_X)
#else /* generic version */
#define procid_dot_x(i, NUM_DATA) ((i) / BLOCK_SIZE % NUM_PROC_X)
#endif

/*
 * toglobal_dot_x(cid, i, NUM_DATA)
 *
 * Splits i into a block number (i / BLOCK_SIZE) and an offset inside the
 * block (i % BLOCK_SIZE), then yields
 *     block * NUM_PROC_X * BLOCK_SIZE + cid * BLOCK_SIZE + offset.
 * The optimized forms obtain the same parts with BLOCK_MASK; when
 * NUM_PROC_X equals BLOCK_SIZE the two multiplications are merged into one.
 * NUM_DATA is not used in the expansion.
 */
#if !defined(OPTIMIZE_VERSION)
#define toglobal_dot_x(cid, i, NUM_DATA)            \
    (((i) / BLOCK_SIZE) * NUM_PROC_X * BLOCK_SIZE   \
     + (cid) * BLOCK_SIZE                           \
     + (i) % BLOCK_SIZE)
#elif (NUM_PROC_X == BLOCK_SIZE)
/* optimized version, NUM_PROC_X == BLOCK_SIZE */
#define toglobal_dot_x(cid, i, NUM_DATA)            \
    ((((i) & ~BLOCK_MASK) + (cid)) * NUM_PROC_X     \
     + ((i) & BLOCK_MASK))
#else
/* optimized version, NUM_PROC_X != BLOCK_SIZE */
#define toglobal_dot_x(cid, i, NUM_DATA)            \
    (((i) & ~BLOCK_MASK) * NUM_PROC_X               \
     + (cid) * BLOCK_SIZE                           \
     + ((i) & BLOCK_MASK))
#endif

/*
 * tolocal_dot_x(cid, i, NUM_DATA)
 *
 * Yields (i / BLOCK_SIZE / NUM_PROC_X) * BLOCK_SIZE + i % BLOCK_SIZE.
 * The optimized form clears the low PROC_BLOCK_MASK bits of i, divides by
 * NUM_PROC_X and adds the low BLOCK_MASK bits.
 * Neither cid nor NUM_DATA is used in the expansion.
 */
#ifdef OPTIMIZE_VERSION
#define tolocal_dot_x(cid, i, NUM_DATA) \
    (((i) & ~PROC_BLOCK_MASK) / NUM_PROC_X + ((i) & BLOCK_MASK))
#else /* generic version */
#define tolocal_dot_x(cid, i, NUM_DATA) \
    (((i) / BLOCK_SIZE) / NUM_PROC_X * BLOCK_SIZE + (i) % BLOCK_SIZE)
#endif

/*
 * procid_dot_y(i, NUM_DATA)
 *
 * Yields i modulo NUM_PROC_Y.  The optimized form uses a bitwise AND with
 * MASK_Y instead; MASK_Y is not defined in this header (presumably
 * NUM_PROC_Y - 1 from conf.h -- confirm).
 * NUM_DATA is not used in the expansion.
 */
#ifdef OPTIMIZE_VERSION
#define procid_dot_y(i, NUM_DATA) ((i) & MASK_Y)
#else /* generic code */
#define procid_dot_y(i, NUM_DATA) ((i) % NUM_PROC_Y)
#endif

/*
 * num_in_proc_dot_y(cid, NUM_DATA)
 *
 * Yields NUM_DATA / NUM_PROC_Y, plus one when cid is below the remainder of
 * NUM_DATA.  The generic form computes the remainder with % NUM_PROC_Y, the
 * optimized form with & MASK_Y (MASK_Y is not defined in this header).
 * NUM_DATA is evaluated more than once.
 */
#ifdef OPTIMIZE_VERSION
#define num_in_proc_dot_y(cid, NUM_DATA)  \
    ((cid) >= ((NUM_DATA) & MASK_Y)       \
         ? (NUM_DATA) / NUM_PROC_Y        \
         : (NUM_DATA) / NUM_PROC_Y + 1)
#else /* generic code */
#define num_in_proc_dot_y(cid, NUM_DATA)  \
    ((cid) >= (NUM_DATA) % NUM_PROC_Y     \
         ? (NUM_DATA) / NUM_PROC_Y        \
         : (NUM_DATA) / NUM_PROC_Y + 1)
#endif

/*
 * toglobal_dot_y(cid, i, NUM_DATA)
 *
 * Yields i * NUM_PROC_Y + cid.  NUM_DATA is not used in the expansion.
 */
#define toglobal_dot_y(cid, i, NUM_DATA) ((cid) + (i) * NUM_PROC_Y)

/*
 * tolocal_dot_y(cid, i, NUM_DATA)
 *
 * Yields i / NUM_PROC_Y.  Neither cid nor NUM_DATA is used in the expansion.
 */
#define tolocal_dot_y(cid, i, NUM_DATA) ((i) / NUM_PROC_Y)

#endif
