/*
 *  Routines for redistributing matrix/vector data between the block
 *  and dot layouts on the Cenju-3.
 */

#include "conf.h"
#include "dist.h"
#include "misc.h"


void
trans_block_dot_mat_vec(cid, size, pA, a, pB)
    int cid;
    int size;
    mat pA;
    double *a;
    dot_mat pB;
{
    int i;
    int cidx, cidy;
    int ns = num_in_proc_block(cid, size);
    struct msg_type {
	int size;
	double*	adr;
	double	data[NUM_PER_PROC_X + 1];
    } msg1[NUM_PROC_X];
    int nrx;

    lin_trec(cid, cidx, cidy);
    nrx = num_in_proc_dot_x(cidx, size);

    for (i = 0; i < ns; i++) {
	int j;
	int g_i = toglobal_block(cid, i, size);
	int pidy = procid_dot_y(g_i, size); 
	int l_i = tolocal_dot_y(pidy, g_i, size);

	for (j = 0; j < NUM_PROC_X; j++)
	    msg1[j].size = 0;

	for (j = 0; j < size - BLOCK_SIZE + 1; j += BLOCK_SIZE) {
	    int pidx = procid_dot_x(j, size); 
	    int l_j = tolocal_dot_x(pidx, j, size);
	    int pid = rec_tlin(pidx, pidy);
	    if (pid == cid) {
		/* ***** WARNING ***** */
		/* this code depends BLOCK SIZE */
		pB[l_i][l_j] = pA[i][j];
		pB[l_i][l_j + 1] = pA[i][j + 1];
		pB[l_i][l_j + 2] = pA[i][j + 2];
		pB[l_i][l_j + 3] = pA[i][j + 3];
		pB[l_i][l_j + 4] = pA[i][j + 4];
		pB[l_i][l_j + 5] = pA[i][j + 5];
		pB[l_i][l_j + 6] = pA[i][j + 6];
		pB[l_i][l_j + 7] = pA[i][j + 7];
	    }
	    else {
		struct msg_type *msgp = &msg1[pidx];
		double *mp = &msgp->data[msgp->size];
		if (msgp->size == 0)
		    msgp->adr = &pB[l_i][l_j];
		mp[0] = pA[i][j];
		mp[1] = pA[i][j + 1];
		mp[2] = pA[i][j + 2];
		mp[3] = pA[i][j + 3];
		mp[4] = pA[i][j + 4];
		mp[5] = pA[i][j + 5];
		mp[6] = pA[i][j + 6];
		mp[7] = pA[i][j + 7];
		msgp->size += BLOCK_SIZE;
	    }
	}
	if (j < size) {
	    int pidx = procid_dot_x(j, size); 
	    int l_j = tolocal_dot_x(pidx, j, size);
	    int pid = rec_tlin(pidx, pidy);
	    if (pid == cid) {
		int k;
		double *ap = &pA[i][j];
		double *bp = &pB[l_i][l_j];
		for (k = 0; k < size - j; k++)
		    bp[k] = ap[k];
	    }
	    else {
		struct msg_type *msgp = &msg1[pidx];
		double *mp = &msgp->data[msgp->size];
		int k;
		double *ap = &pA[i][j];
		if (msgp->size == 0)
		    msgp->adr = &pB[l_i][l_j];
		for (k = 0; k < size - j; k++)
		    mp[k] = ap[k];
		msgp->size += size - j;
	    }
	}

	for (j = 0; j < NUM_PROC_X; j++) {
	    struct msg_type *msgp = &msg1[j];
	    int pid = rec_tlin(j, pidy);
	    if (pid == cid) {
		pB[l_i][nrx] = a[i];
	    }
	    else {
		if (msgp->size == 0) {
		    int size_x = num_in_proc_dot_x(j, size);
		    msgp->adr = &pB[l_i][size_x];
		}
		msgp->data[msgp->size] = a[i];
		msgp->size++;
	    }
	    if (msgp->size > 0) {
		CJrmwrite(msgp->data, rec_tlin(j, pidy),
			  msgp->adr, msgp->size * sizeof(double));
	    }
	}
    }
    CJbarrier();
}


void
trans_dot_block_mat_vec(cid, size, pA, pB)
    int cid;
    int size;
    dot_mat pA;
    mat pB;
{
    int i;
    int cidx, cidy;
    int nsx, nsy;

    lin_trec(cid, cidx, cidy);
    nsx = num_in_proc_dot_x(cidx, size);
    nsy = num_in_proc_dot_y(cidy, size);

    for (i = 0; i < nsy; i++) {
	int j;
	int g_i = toglobal_dot_y(cidy, i, size);
	int pid = procid_block(g_i, size); 
	int l_i = tolocal_block(pid, g_i, size);

	for (j = 0; j < nsx - BLOCK_SIZE + 1; j += BLOCK_SIZE) {
	    int g_j = toglobal_dot_x(cidx, j, size);
	    if (pid == cid) {
		/* ***** WARNING ***** */
		/* this code depends on BLOCK_SIZE */
		pB[l_i][g_j] = pA[i][j];
		pB[l_i][g_j + 1] = pA[i][j + 1];
		pB[l_i][g_j + 2] = pA[i][j + 2];
		pB[l_i][g_j + 3] = pA[i][j + 3];
		pB[l_i][g_j + 4] = pA[i][j + 4];
		pB[l_i][g_j + 5] = pA[i][j + 5];
		pB[l_i][g_j + 6] = pA[i][j + 6];
		pB[l_i][g_j + 7] = pA[i][j + 7];
	    }
	    else {
		CJrmwrite(&pA[i][j], pid,
			  &pB[l_i][g_j], BLOCK_SIZE * sizeof(double));
	    }
	}
	if (j < nsx) {
	    int g_j = toglobal_dot_x(cidx, j, size);
	    if (pid == cid) {
		double *ap = &pA[i][j];
		double *bp = &pB[l_i][g_j];
		bcopy(ap, bp, (nsx - j) * sizeof(double));
	    }
	    else {
		CJrmwrite(&pA[i][j], pid,
			  &pB[l_i][g_j], (nsx - j) * sizeof(double));
	    }
	}
	if (cidx == (NUM_PROC_X - 1)) {
	    if (pid == cid) {
		pB[l_i][size] = pA[i][nsx];
	    }
	    else {
		CJrmwrite(&pA[i][nsx], pid,
			  &pB[l_i][size], sizeof(double));
	    }
	}
    }
    CJbarrier();
}
