/*
 *  Search 8 pivot rows
 */

static char rcsid[] = "$Id: pivot8.c,v 1.6 1998/07/31 01:59:31 tatebe Exp $";

#include "dot.h"

#include <sys/types.h>
#include <math.h>
#include <stdio.h>
#ifndef __GNUC__
#include <assert.h>
#else
#define assert(x)
#endif

#include "conf.h"
#include "dist.h"
#include "lu.h"
#include "misc.h"

/*  pivot information  */

/*
 *  piv[c]         row chosen as pivot for column c; p_pivot_8_y() stores
 *                 the result of toglobal_y() here.
 *  l_piv[j]       local row index found at position j of the local row
 *                 ordering; p_pivot_8_y() swaps each pivot row it owns to
 *                 the front of the not-yet-used positions.
 *  inverse_piv[c] reciprocal of the pivot value of column c.
 */
extern int piv[N];
extern int l_piv[NUM_PER_PROC_Y];
extern double inverse_piv[N];

/*  array for index conversion  */

/*
 *  broad_piv() reads g_to_l[i] as the first local column to send when
 *  this process does not hold panel column i.
 */
extern int g_to_l[N];

/*  receive buffer  */

/*
 *  Both buffers have two halves selected by (iteration >> 3) & 1.
 *  piv_A_8[b][j]  filled by get_pivot8_x() with the j-th pivot row when
 *                 COPY_RECV_MSG is defined.
 *  buf_8[b]       8 doubles per local row: packed by broad_piv() on the
 *                 sending side, filled by get_pivot8_y() on the receiving
 *                 side when COPY_RECV_MSG is defined.
 */
double piv_A_8[2][8][NUM_PER_PROC_X + 1];
double buf_8[2][8 * NUM_PER_PROC_Y];

/*  pivot rows in active 8 columns */

/*
 *  ppiv[k]   copy of the 8 panel entries of the k-th pivot row, as
 *            exchanged through max8_y().
 *  ppivp[k]  the 8 panel entries of the k-th pivot row actually used by
 *            the search_max routines: points into pA when the row is
 *            stored locally, otherwise at ppiv[k].
 */
double *ppivp[8];
double ppiv[8][8];


/*
 *  First stage of the panel search: find the entry of largest magnitude
 *  in local column local_ix among the rows l_piv[st_j .. end_j-1].
 *  Nothing in pA is modified.
 *
 *    pA        local part of the matrix
 *    st_j      first position in l_piv to look at
 *    end_j     one past the last position to look at
 *    local_ix  local column index to scan
 *    tmp_max   out: largest magnitude found, 0 if no entry exceeds 0
 *    piv_i1    out: position in l_piv of that entry, 0 if no entry
 *              exceeds 0
 */
void
search_max1(pA, st_j, end_j, local_ix, tmp_max, piv_i1)
    dot_mat pA;
    int st_j, end_j;
    int local_ix;
    double *tmp_max;
    int *piv_i1;
{
    double best = 0;
    int best_j = 0;
    int row;

    for (row = st_j; row < end_j; row++) {
	double mag = fabs(pA[l_piv[row]][local_ix]);

	/* strict comparison: the first of several equal maxima is kept */
	if (mag > best) {
	    best = mag;
	    best_j = row;
	}
    }

    *tmp_max = best;
    *piv_i1 = best_j;
}

/*
 *  Second stage of the panel search.
 *
 *  For every row l_piv[st_j .. end_j-1]:
 *    - panel column 0 is multiplied by ppivp[0][0] (the reciprocal of the
 *      first pivot) and the product is stored back as the multiplier,
 *    - multiplier * ppivp[0][1] is subtracted from panel column 1,
 *    - the updated column-1 entry takes part in the maximum search.
 *
 *    pA        local part of the matrix (updated in place)
 *    st_j      first position in l_piv to process
 *    end_j     one past the last position to process
 *    local_ix  local index of panel column 0
 *    tmp_max   out: largest magnitude found in panel column 1 (0 if none
 *              exceeds 0)
 *    piv_i2    out: position in l_piv of that entry (0 if none exceeds 0)
 */
void
search_max2(pA, st_j, end_j, local_ix, tmp_max, piv_i2)
    dot_mat pA;
    int st_j, end_j, local_ix;
    double *tmp_max;
    int *piv_i2;
{
    double *r0 = ppivp[0];
    double recip = r0[0];
    double u0 = r0[1];
    double best = 0;
    int best_j = 0;
    int row;

    for (row = st_j; row < end_j; row++) {
	double *a = &pA[l_piv[row]][local_ix];
	double mult = a[0] * recip;
	double mag;

	a[0] = mult;
	a[1] -= mult * u0;

	mag = fabs(a[1]);
	if (mag > best) {
	    best = mag;
	    best_j = row;
	}
    }

    *tmp_max = best;
    *piv_i2 = best_j;
}

/*
 *  Third stage of the panel search.
 *
 *  First the second pivot row is brought up to date: its entries in panel
 *  columns 2..7 have the contribution of the first pivot row removed.
 *
 *  Then, for every row l_piv[st_j .. end_j-1]:
 *    - panel column 1 is multiplied by ppivp[1][1] (the reciprocal of the
 *      second pivot) and stored back as the multiplier,
 *    - panel column 2 is reduced by the contributions of the first two
 *      pivot rows,
 *    - the updated column-2 entry takes part in the maximum search.
 *
 *    pA        local part of the matrix (updated in place)
 *    st_j      first position in l_piv to process
 *    end_j     one past the last position to process
 *    local_ix  local index of panel column 0
 *    tmp_max   out: largest magnitude found in panel column 2 (0 if none
 *              exceeds 0)
 *    piv_i3    out: position in l_piv of that entry (0 if none exceeds 0)
 */
void
search_max3(pA, st_j, end_j, local_ix, tmp_max, piv_i3)
    dot_mat pA;
    int st_j, end_j, local_ix;
    double *tmp_max;
    int *piv_i3;
{
    double *r0 = ppivp[0];
    double *r1 = ppivp[1];
    double recip = r1[1];
    double u0 = r0[2];
    double m0 = r1[0];
    double u1;
    double best = 0;
    int best_j = 0;
    int k, row;

    /* bring the second pivot row up to date (panel columns 2..7) */
    for (k = 2; k < 8; k++)
	r1[k] -= m0 * r0[k];
    u1 = r1[2];

    for (row = st_j; row < end_j; row++) {
	double *a = &pA[l_piv[row]][local_ix];
	double mult = a[1] * recip;
	double mag;

	a[1] = mult;
	a[2] -= a[0] * u0 + mult * u1;

	mag = fabs(a[2]);
	if (mag > best) {
	    best = mag;
	    best_j = row;
	}
    }

    *tmp_max = best;
    *piv_i3 = best_j;
}

/*
 *  Fourth stage of the panel search.
 *
 *  First the third pivot row is brought up to date: its entries in panel
 *  columns 3..7 have the contributions of pivot rows 1 and 2 removed.
 *
 *  Then, for every row l_piv[st_j .. end_j-1]:
 *    - panel column 2 is multiplied by ppivp[2][2] (the reciprocal of the
 *      third pivot) and stored back as the multiplier,
 *    - panel column 3 is reduced by the contributions of the first three
 *      pivot rows,
 *    - the updated column-3 entry takes part in the maximum search.
 *
 *    pA        local part of the matrix (updated in place)
 *    st_j      first position in l_piv to process
 *    end_j     one past the last position to process
 *    local_ix  local index of panel column 0
 *    tmp_max   out: largest magnitude found in panel column 3 (0 if none
 *              exceeds 0)
 *    piv_i4    out: position in l_piv of that entry (0 if none exceeds 0)
 */
void
search_max4(pA, st_j, end_j, local_ix, tmp_max, piv_i4)
    dot_mat pA;
    int st_j, end_j, local_ix;
    double *tmp_max;
    int *piv_i4;
{
    double *r0 = ppivp[0];
    double *r1 = ppivp[1];
    double *r2 = ppivp[2];
    double recip = r2[2];
    double u0 = r0[3];
    double u1 = r1[3];
    double m0 = r2[0];
    double m1 = r2[1];
    double u2;
    double best = 0;
    int best_j = 0;
    int k, row;

    /* bring the third pivot row up to date (panel columns 3..7) */
    for (k = 3; k < 8; k++)
	r2[k] -=
	    m0 * r0[k] + m1 * r1[k];
    u2 = r2[3];

    for (row = st_j; row < end_j; row++) {
	double *a = &pA[l_piv[row]][local_ix];
	double mult = a[2] * recip;
	double mag;

	a[2] = mult;
	a[3] -=
	    a[0] * u0 + a[1] * u1 +
	    mult * u2;

	mag = fabs(a[3]);
	if (mag > best) {
	    best = mag;
	    best_j = row;
	}
    }

    *tmp_max = best;
    *piv_i4 = best_j;
}

/*
 *  Fifth stage of the panel search.
 *
 *  First the fourth pivot row is brought up to date: its entries in panel
 *  columns 4..7 have the contributions of pivot rows 1..3 removed.
 *
 *  Then, for every row l_piv[st_j .. end_j-1]:
 *    - panel column 3 is multiplied by ppivp[3][3] (the reciprocal of the
 *      fourth pivot) and stored back as the multiplier,
 *    - panel column 4 is reduced by the contributions of the first four
 *      pivot rows,
 *    - the updated column-4 entry takes part in the maximum search.
 *
 *    pA        local part of the matrix (updated in place)
 *    st_j      first position in l_piv to process
 *    end_j     one past the last position to process
 *    local_ix  local index of panel column 0
 *    tmp_max   out: largest magnitude found in panel column 4 (0 if none
 *              exceeds 0)
 *    piv_i5    out: position in l_piv of that entry (0 if none exceeds 0)
 */
void
search_max5(pA, st_j, end_j, local_ix, tmp_max, piv_i5)
    dot_mat pA;
    int st_j, end_j, local_ix;
    double *tmp_max;
    int *piv_i5;
{
    double *r0 = ppivp[0];
    double *r1 = ppivp[1];
    double *r2 = ppivp[2];
    double *r3 = ppivp[3];
    double recip = r3[3];
    double u0 = r0[4];
    double u1 = r1[4];
    double u2 = r2[4];
    double m0 = r3[0];
    double m1 = r3[1];
    double m2 = r3[2];
    double u3;
    double best = 0;
    int best_j = 0;
    int k, row;

    /* bring the fourth pivot row up to date (panel columns 4..7) */
    for (k = 4; k < 8; k++)
	r3[k] -=
	    m0 * r0[k] + m1 * r1[k] +
	    m2 * r2[k];
    u3 = r3[4];

    for (row = st_j; row < end_j; row++) {
	double *a = &pA[l_piv[row]][local_ix];
	double mult = a[3] * recip;
	double mag;

	a[3] = mult;
	a[4] -=
	    a[0] * u0 + a[1] * u1 +
	    a[2] * u2 + mult * u3;

	mag = fabs(a[4]);
	if (mag > best) {
	    best = mag;
	    best_j = row;
	}
    }

    *tmp_max = best;
    *piv_i5 = best_j;
}

/*
 *  Sixth stage of the panel search.
 *
 *  First the fifth pivot row is brought up to date: its entries in panel
 *  columns 5..7 have the contributions of pivot rows 1..4 removed.
 *
 *  Then, for every row l_piv[st_j .. end_j-1]:
 *    - panel column 4 is multiplied by ppivp[4][4] (the reciprocal of the
 *      fifth pivot) and stored back as the multiplier,
 *    - panel column 5 is reduced by the contributions of the first five
 *      pivot rows,
 *    - the updated column-5 entry takes part in the maximum search.
 *
 *    pA        local part of the matrix (updated in place)
 *    st_j      first position in l_piv to process
 *    end_j     one past the last position to process
 *    local_ix  local index of panel column 0
 *    tmp_max   out: largest magnitude found in panel column 5 (0 if none
 *              exceeds 0)
 *    piv_i6    out: position in l_piv of that entry (0 if none exceeds 0)
 */
void
search_max6(pA, st_j, end_j, local_ix, tmp_max, piv_i6)
    dot_mat pA;
    int st_j, end_j, local_ix;
    double *tmp_max;
    int *piv_i6;
{
    double *r0 = ppivp[0];
    double *r1 = ppivp[1];
    double *r2 = ppivp[2];
    double *r3 = ppivp[3];
    double *r4 = ppivp[4];
    double recip = r4[4];
    double u0 = r0[5];
    double u1 = r1[5];
    double u2 = r2[5];
    double u3 = r3[5];
    double m0 = r4[0];
    double m1 = r4[1];
    double m2 = r4[2];
    double m3 = r4[3];
    double u4;
    double best = 0;
    int best_j = 0;
    int k, row;

    /* bring the fifth pivot row up to date (panel columns 5..7) */
    for (k = 5; k < 8; k++)
	r4[k] -=
	    m0 * r0[k] + m1 * r1[k] +
	    m2 * r2[k] + m3 * r3[k];
    u4 = r4[5];

    for (row = st_j; row < end_j; row++) {
	double *a = &pA[l_piv[row]][local_ix];
	double mult = a[4] * recip;
	double mag;

	a[4] = mult;
	a[5] -=
	    a[0] * u0 + a[1] * u1 +
	    a[2] * u2 + a[3] * u3 +
	    mult * u4;

	mag = fabs(a[5]);
	if (mag > best) {
	    best = mag;
	    best_j = row;
	}
    }

    *tmp_max = best;
    *piv_i6 = best_j;
}

/*
 *  Seventh stage of the panel search.
 *
 *  First the sixth pivot row is brought up to date: its entries in panel
 *  columns 6 and 7 have the contributions of pivot rows 1..5 removed.
 *
 *  Then, for every row l_piv[st_j .. end_j-1]:
 *    - panel column 5 is multiplied by ppivp[5][5] (the reciprocal of the
 *      sixth pivot) and stored back as the multiplier,
 *    - panel column 6 is reduced by the contributions of the first six
 *      pivot rows,
 *    - the updated column-6 entry takes part in the maximum search.
 *
 *    pA        local part of the matrix (updated in place)
 *    st_j      first position in l_piv to process
 *    end_j     one past the last position to process
 *    local_ix  local index of panel column 0
 *    tmp_max   out: largest magnitude found in panel column 6 (0 if none
 *              exceeds 0)
 *    piv_i7    out: position in l_piv of that entry (0 if none exceeds 0)
 */
void
search_max7(pA, st_j, end_j, local_ix, tmp_max, piv_i7)
    dot_mat pA;
    int st_j, end_j, local_ix;
    double *tmp_max;
    int *piv_i7;
{
    double *r0 = ppivp[0];
    double *r1 = ppivp[1];
    double *r2 = ppivp[2];
    double *r3 = ppivp[3];
    double *r4 = ppivp[4];
    double *r5 = ppivp[5];
    double recip = r5[5];
    double u0 = r0[6];
    double u1 = r1[6];
    double u2 = r2[6];
    double u3 = r3[6];
    double u4 = r4[6];
    double m0 = r5[0];
    double m1 = r5[1];
    double m2 = r5[2];
    double m3 = r5[3];
    double m4 = r5[4];
    double u5;
    double best = 0;
    int best_j = 0;
    int k, row;

    /* bring the sixth pivot row up to date (panel columns 6 and 7) */
    for (k = 6; k < 8; k++)
	r5[k] -=
	    m0 * r0[k] + m1 * r1[k] +
	    m2 * r2[k] + m3 * r3[k] +
	    m4 * r4[k];
    u5 = r5[6];

    for (row = st_j; row < end_j; row++) {
	double *a = &pA[l_piv[row]][local_ix];
	double mult = a[5] * recip;
	double mag;

	a[5] = mult;
	a[6] -=
	    a[0] * u0 + a[1] * u1 +
	    a[2] * u2 + a[3] * u3 +
	    a[4] * u4 + mult * u5;

	mag = fabs(a[6]);
	if (mag > best) {
	    best = mag;
	    best_j = row;
	}
    }

    *tmp_max = best;
    *piv_i7 = best_j;
}

/*
 *  Eighth and last stage of the panel search.
 *
 *  First the seventh pivot row is brought up to date: its entry in panel
 *  column 7 has the contributions of pivot rows 1..6 removed.
 *
 *  Then, for every row l_piv[st_j .. end_j-1]:
 *    - panel column 6 is multiplied by ppivp[6][6] (the reciprocal of the
 *      seventh pivot) and stored back as the multiplier,
 *    - panel column 7 is reduced by the contributions of the first seven
 *      pivot rows,
 *    - the updated column-7 entry takes part in the maximum search.
 *
 *    pA        local part of the matrix (updated in place)
 *    st_j      first position in l_piv to process
 *    end_j     one past the last position to process
 *    local_ix  local index of panel column 0
 *    tmp_max   out: largest magnitude found in panel column 7 (0 if none
 *              exceeds 0)
 *    piv_i8    out: position in l_piv of that entry (0 if none exceeds 0)
 */
void
search_max8(pA, st_j, end_j, local_ix, tmp_max, piv_i8)
    dot_mat pA;
    int st_j, end_j, local_ix;
    double *tmp_max;
    int *piv_i8;
{
    double *r6 = ppivp[6];
    double recip = r6[6];
    double u0 = ppivp[0][7];
    double u1 = ppivp[1][7];
    double u2 = ppivp[2][7];
    double u3 = ppivp[3][7];
    double u4 = ppivp[4][7];
    double u5 = ppivp[5][7];
    double m0 = r6[0];
    double m1 = r6[1];
    double m2 = r6[2];
    double m3 = r6[3];
    double m4 = r6[4];
    double m5 = r6[5];
    double u6;
    double best = 0;
    int best_j = 0;
    int row;

    /* bring the seventh pivot row up to date (panel column 7 only) */
    r6[7] -=
	m0 * u0 + m1 * u1 +
	m2 * u2 + m3 * u3 +
	m4 * u4 + m5 * u5;
    u6 = r6[7];

    for (row = st_j; row < end_j; row++) {
	double *a = &pA[l_piv[row]][local_ix];
	double mult = a[6] * recip;
	double mag;

	a[6] = mult;
	a[7] -=
	    a[0] * u0 + a[1] * u1 +
	    a[2] * u2 + a[3] * u3 +
	    a[4] * u4 + a[5] * u5 +
	    mult * u6;

	mag = fabs(a[7]);
	if (mag > best) {
	    best = mag;
	    best_j = row;
	}
    }

    *tmp_max = best;
    *piv_i8 = best_j;
}

/*
 *  Run stage k (0..7) of the 8-row pivot search for the panel whose
 *  first global column is i.
 *
 *  The stage
 *    1. calls search_max<k+1> on the local rows that have not been chosen
 *       yet, l_piv[st_j + *l_jp .. end_j-1]; for k > 0 that call also
 *       finishes the elimination of panel column k-1 in those rows,
 *    2. copies the 8 panel entries of the local candidate row into
 *       ppiv[k] and passes candidate, magnitude and copy to max8_y()
 *       (presumably a reduction that leaves the winning row in piv[i + k]
 *       and its entries in ppiv[k] -- confirm against max8_y),
 *    3. if the winning row is stored on this process: sends its l_piv
 *       position with broad_send_x(), swaps it to the front of the
 *       remaining positions of l_piv (together with the matching 8-double
 *       group of py, when py is given), points ppivp[k] at the row inside
 *       pA and increments *l_jp; otherwise points ppivp[k] at ppiv[k],
 *    4. stores the first k entries of ppiv[k] in pivt, starting at index
 *       k * (k - 1) / 2,
 *    5. records the reciprocal of the pivot in inverse_piv[i + k].
 *
 *    cid, size  passed through to the communication and index routines
 *    cidy       y coordinate of this process
 *    pA         local part of the matrix (updated in place)
 *    i          global index of panel column 0
 *    k          stage number, 0..7
 *    st_j       first l_piv position belonging to this panel
 *    end_j      one past the last local l_piv position
 *    local_ix   local index of panel column 0
 *    l_jp       in/out: number of pivot rows of this panel found locally
 *               so far
 *    pivt       out: packed multipliers, see p_pivot_8_y()
 *    py         optional (may be NULL): 8 doubles per l_piv position,
 *               counted from st_j, permuted together with l_piv
 */
static void
pivot8_stage(cid, size, cidy, pA, i, k, st_j, end_j, local_ix, l_jp, pivt, py)
    int cid, size, cidy;
    dot_mat pA;
    int i, k, st_j, end_j, local_ix;
    int *l_jp;
    double *pivt;
    double *py;
{
    double local_max = 0;
    double *pp = &ppiv[k][0];
    int l_j = *l_jp;
    int first_j = st_j + l_j;
    int piv_j, piv_i;
    int m;

    /* update the remaining rows and search the next pivot candidate */
    switch (k) {
    case 0:
	search_max1(pA, first_j, end_j, local_ix, &local_max, &piv_j);
	break;
    case 1:
	search_max2(pA, first_j, end_j, local_ix, &local_max, &piv_j);
	break;
    case 2:
	search_max3(pA, first_j, end_j, local_ix, &local_max, &piv_j);
	break;
    case 3:
	search_max4(pA, first_j, end_j, local_ix, &local_max, &piv_j);
	break;
    case 4:
	search_max5(pA, first_j, end_j, local_ix, &local_max, &piv_j);
	break;
    case 5:
	search_max6(pA, first_j, end_j, local_ix, &local_max, &piv_j);
	break;
    case 6:
	search_max7(pA, first_j, end_j, local_ix, &local_max, &piv_j);
	break;
    case 7:
	search_max8(pA, first_j, end_j, local_ix, &local_max, &piv_j);
	break;
    default:
	/* not reached: the only caller passes 0..7 */
	return;
    }

    piv_i = l_piv[piv_j];
    piv[i + k] = toglobal_y(cidy, piv_i, size);
    bcopy(&pA[piv_i][local_ix], pp, 8 * sizeof(double));

    max8_y(cid, &piv[i + k], &local_max, pp);

    if (procid_y(piv[i + k], size) == cidy) {
	int tmp_i;
	double tmp_py[8];

	broad_send_x(cid, BROAD_LOC_PIV1 + (i + k) * SKIP,
		     &piv_j, sizeof(int));

	/* the pivot row lives here: later stages update it in place */
	ppivp[k] = &pA[piv_i][local_ix];

	/* move the pivot row to the front of the remaining rows */
	tmp_i = l_piv[first_j];
	l_piv[first_j] = piv_i;
	l_piv[piv_j] = tmp_i;

	if (py != NULL) {
	    /* py is indexed relative to st_j, 8 doubles per row */
	    bcopy(&py[8 * l_j], tmp_py, 8 * sizeof(double));
	    bcopy(&py[8 * (piv_j - st_j)], &py[8 * l_j], 8 * sizeof(double));
	    bcopy(tmp_py, &py[8 * (piv_j - st_j)], 8 * sizeof(double));
	}
	*l_jp = l_j + 1;
    }
    else {
	/* the pivot row lives elsewhere: work on the received copy */
	ppivp[k] = pp;
    }

#ifndef __GNUC__
    assert(pp[k] != 0.0);
#endif

    /* entries left of the diagonal, packed row by row */
    for (m = 0; m < k; m++)
	pivt[k * (k - 1) / 2 + m] = pp[m];

    if (k < 7) {
	/* the next stage reads the reciprocal from ppivp[k][k] */
	inverse_piv[i + k] = ppivp[k][k] = 1 / ppivp[k][k];
    }
    else {
	/* no further stage: the reciprocal goes to pivt[28] instead */
	inverse_piv[i + k] = pivt[28] = 1 / pp[7];
    }
}

/*
 *  Search the 8 pivot rows of the panel that starts at global column i.
 *
 *  The eight stages run in order; stage k picks the pivot row for column
 *  i + k after the rows still in play have been updated with the pivot
 *  rows found so far (see pivot8_stage()).
 *
 *  On return
 *    piv[i .. i+7]          hold the chosen pivot rows,
 *    inverse_piv[i .. i+7]  hold the reciprocals of the pivot values,
 *    ppivp[0 .. 7]          point at the 8 panel entries of each pivot row,
 *    l_piv                  has the locally stored pivot rows moved to
 *                           positions st_j, st_j + 1, ...
 *
 *    cid    id of this process, passed to lin_trec() and the
 *           communication routines
 *    size   passed through to the index conversion routines
 *    pA     local part of the matrix (updated in place)
 *    i      global index of the first panel column
 *    st_j   first l_piv position still taking part in the factorisation
 *    pivt   out, 29 doubles: pivt[0..27] receive, for pivot rows 1..7,
 *           the entries left of the diagonal within the panel, packed row
 *           by row; pivt[28] receives the reciprocal of the eighth pivot
 *    py     optional (may be NULL): array with 8 doubles per l_piv
 *           position counted from st_j; its groups are swapped whenever
 *           the corresponding l_piv entries are swapped
 */
void
p_pivot_8_y(cid, size, pA, i, st_j, pivt, py)
    int cid, size;
    dot_mat pA;
    int i, st_j;
    double *pivt;
    double *py;
{
    int local_ix;
    int cidx, cidy;
    int l_j = 0;
    int end_j;
    int k;

    lin_trec(cid, cidx, cidy);
    local_ix = tolocal_x(cidx, i, size);
    end_j = num_in_proc_y(cidy, size);

    for (k = 0; k < 8; k++) {
	pivot8_stage(cid, size, cidy, pA, i, k, st_j, end_j, local_ix,
		     &l_j, pivt, py);
    }
}

/*
 *  Distribute the results of the pivot search for the panel that starts
 *  at global column i.
 *
 *  Pivot rows: for each of the 8 pivots, the process that stores the row
 *  sends the part starting at local column first_col with
 *  broad_bsend_y() and puts a pointer to it in piv1[k].  Every other
 *  process obtains the row through get_pivot8_x(), or stores NULL when
 *  built with __AP__.
 *
 *  Panel: a process holding panel column i packs the 8 panel entries of
 *  every row l_piv[st_j .. ] into buf_8 and sends them with
 *  broad_bsend_x(); the others obtain them through get_pivot8_y(), or
 *  store NULL when built with __AP__.  The result goes to piv1[8].
 *
 *    cid     id of this process
 *    size    passed through to the index conversion routines
 *    size_x  used to compute the length of the row part that is sent
 *    pA      local part of the matrix
 *    i       global index of the first panel column
 *    st_j    first l_piv position whose panel entries are packed
 *    piv1    out, 9 pointers: piv1[0..7] pivot rows, piv1[8] panel buffer
 */
void
broad_piv(cid, size, size_x, pA, i, st_j, piv1)
    int cid, size, size_x;
    dot_mat pA;
    int i, st_j;
    double *piv1[];
{
    int cidx, cidy;
    int k, row;
    int local_ix, first_col;
    int owns_panel;
    int n_rows;
    int which_buf = (i >> 3) & 1;

    lin_trec(cid, cidx, cidy);
    local_ix = tolocal_x(cidx, i, size);
    owns_panel = (procid_x(i, size) == cidx);
    n_rows = num_in_proc_y(cidy, size);

    /* the panel owner skips its own 8 panel columns */
    first_col = owns_panel ? local_ix + 8 : g_to_l[i];

    for (k = 0; k < 8; k++) {
	int g_row = piv[i + k];

	if (procid_y(g_row, size) != cidy) {
#ifdef __AP__
	    piv1[k] = NULL;
#else  /** __MPI__ **/
	    piv1[k] = get_pivot8_x(cid, i, k);
#endif
	}
	else {
	    double *seg = &pA[tolocal_y(cidy, g_row, size)][first_col];

	    broad_bsend_y(cid, BROAD_PIV_1 + (i + k) * SKIP,
			 seg,
			 (size_x - first_col + 1) * sizeof(double));
	    piv1[k] = seg;
	}
    }

    if (!owns_panel) {
#ifdef __AP__
	piv1[8] = NULL;
#else  /** __MPI__ **/
	piv1[8] = get_pivot8_y(cid, i);
#endif
    }
    else {
	double *bufp = &buf_8[which_buf][0];
	double *dst = bufp;

	/* pack 8 panel entries per row, rows in l_piv order */
	for (row = st_j; row < n_rows; row++) {
	    double *src = &pA[l_piv[row]][local_ix];
	    int c;

	    for (c = 0; c < 8; c++)
		*dst++ = src[c];
	}
	broad_bsend_x(cid, BROAD_PIV_X + i * SKIP,
		     bufp, 8 * (n_rows - st_j) * sizeof(double));
	piv1[8] = bufp; /* this pointer is not referred */
    }

}

/*
 *  Receive the j-th pivot row of the panel that starts at global column
 *  iter.
 *
 *  With COPY_RECV_MSG the row is received into piv_A_8, using the half
 *  selected by (iter >> 3) & 1, and a pointer to that slot is returned.
 *  Otherwise the pointer delivered by fbroad_recv_y() is returned as is.
 *
 *    cid   id of this process
 *    iter  global index of the first panel column
 *    j     number of the pivot row within the panel, 0..7
 */
double*
get_pivot8_x(cid, iter, j)
    int cid;
    int iter, j;
{
#ifdef COPY_RECV_MSG
    double *slot = &piv_A_8[(iter >> 3) & 1][j][0];

    broad_brecv_y(cid, BROAD_PIV_1 + (iter + j) * SKIP,
		 slot,
		 (NUM_PER_PROC_X + 1) * sizeof(double));
    return slot;
#else
    return (double*)fbroad_recv_y(cid, BROAD_PIV_1 + (iter + j) * SKIP);
#endif
}

/*
 *  Receive the packed 8-column panel that starts at global column i.
 *
 *  With COPY_RECV_MSG the panel is received into buf_8, using the half
 *  selected by (i >> 3) & 1, and a pointer to that half is returned.
 *  Otherwise the pointer delivered by fbroad_recv_x() is returned as is.
 *
 *  NOTE(review): the receive length is written with BLOCK_SIZE while
 *  buf_8 is declared with a literal 8 -- confirm that BLOCK_SIZE is 8.
 *
 *    cid  id of this process
 *    i    global index of the first panel column
 */
double*
get_pivot8_y(cid, i)
    int cid, i;
{
#ifdef COPY_RECV_MSG
    double *half = &buf_8[(i >> 3) & 1][0];

    broad_brecv_x(cid, BROAD_PIV_X + i * SKIP,
		  half,
		  BLOCK_SIZE * NUM_PER_PROC_Y * sizeof(double));
    return half;
#else
    return (double*)fbroad_recv_x(cid, BROAD_PIV_X + i * SKIP);
#endif
}
