/* @(#) ==== PARALLEL GAUSSIAN ELIMINATION COPYRIGHT OSAMU TATEBE 1995 ==== */

static char copyright[] = "@(#) PARALLEL GAUSSIAN ELIMINATION COPYRIGHT OSAMU TATEBE 1995";

static char rcsid[] = "$Id: gauss.c,v 1.4 1998/07/29 09:29:51 tatebe Exp $";

/*
 *  Build-time options.  An option whose #define is commented out is
 *  disabled.
 *
 *	UNROLL_8		gauss() first processes the pivots 8 at
 *				a time; without it gauss() starts its
 *				single-pivot loop at i = 0
 *	NO_SOFTWARE_PIPE	when defined, the plain loops are
 *				compiled; when left undefined (as here)
 *				the software-pipelined variants with a
 *				prologue and an epilogue are compiled
 *	MESSAGE_VECTORIZATION	the pivot information is packed into one
 *				message (see piv_buf) instead of being
 *				sent as separate messages
 *	NO_TILE			when defined, the untiled update loops
 *				are compiled; when left undefined (as
 *				here) the TILE_J x TILE_K tiled loops
 *				are compiled
 *	MEASURE_TIME		enables the statements wrapped in M_TIME()
 *	MEASURE_PRECISE_TIME	enables the statements wrapped in MP_TIME()
 */
#define UNROLL_8		/* Unrolling */
/*#define NO_SOFTWARE_PIPE	*//* Enable software pipelining */
#define MESSAGE_VECTORIZATION	/* Vectorize messages */
/*#define NO_TILE		*//* Enable tiling execution */
/*#define MEASURE_TIME*/
/*#define MEASURE_PRECISE_TIME*/

#include "dot.h"

#include <sys/types.h>
#include <math.h>
#include <stdio.h>
#include <string.h>
#ifndef __GNUC__
#include <assert.h>
#else
#define assert(x)
#endif

#include "conf.h"
#include "dist.h"
#include "lu.h"
#include "misc.h"

/*
 *  Flags compiled only when both __AP__ and __BROAD_PUT__ are defined.
 *  NOTE(review): none of the three is referenced in the part of this
 *  file that was reviewed -- confirm whether they are still needed.
 */
#if defined(__AP__) && defined(__BROAD_PUT__)
static int put_f = 0;
static int local_f = 0;
static int dma_f = 0;
#endif

/*
 *  Timer source for the measurements.  With __AP__, MPI_Wtime() is
 *  mapped onto dgettime(); with __MPI__, "mpi.h" is included.  When
 *  timing is requested and neither is defined the build stops.
 */
#if defined(MEASURE_TIME) || defined(MEASURE_PRECISE_TIME)
#ifdef __AP__
extern double dgettime();
#define MPI_Wtime()	dgettime()
#elif defined(__MPI__)
#include "mpi.h"
#else
#error Define either __AP__ or __MPI__
#endif
#endif /* defined(MEASURE_TIME) || defined(MEASURE_PRECISE_TIME) */

/*
 *  M_TIME(s) expands to the statement 's' when MEASURE_TIME is
 *  defined and to nothing otherwise.
 */
#ifdef MEASURE_TIME
#define M_TIME(s) s
#else
#define M_TIME(s)
#endif /* MEASURE_TIME */

/*
 *  MP_TIME(s) does the same for MEASURE_PRECISE_TIME.  The
 *  accumulators below hold the total time (time_core, time_piv) and
 *  the start stamp of the interval being measured (time_core1,
 *  time_piv1).
 */
#ifdef MEASURE_PRECISE_TIME
#define MP_TIME(s) s

static double time_core = 0.0, time_core1;
static double time_piv = 0.0, time_piv1;
#else
#define MP_TIME(s)
#endif /* MEASURE_PRECISE_TIME */

/*
 *  Pivot information
 */

/*
 *  piv[i]		global y index recorded for elimination step i;
 *			it is the value handed to procid_y() and
 *			tolocal_y()
 *  l_piv[]		local row indices; rows of the matrix are
 *			addressed as pA[l_piv[j]], and entries are
 *			swapped when pivot information arrives
 *  inverse_piv[i]	used as the multiplier 'inv_pivot' of step i in
 *			the single-pivot loop of gauss()
 */
int piv[N];
int l_piv[NUM_PER_PROC_Y];
double inverse_piv[N]; 

/*
 *  Arrays for index conversion 
 *
 *  g_to_l[i] is used by gauss() as the local starting column for
 *  step i on nodes for which procid_x(i, size) is not their own
 *  cidx.
 *  NOTE(review): g_to_l_e is not referenced in the part of this file
 *  that was reviewed.
 */

int g_to_l[N];
static int g_to_l_e[N];

/*
 *  For message vectorization of the info. of pivoting
 *
 *  piv_buf is the staging buffer into which the pivot information is
 *  packed before a send and, with COPY_RECV_MSG, into which it is
 *  received.  The largest message built in the reviewed part of the
 *  file is 8 ints + 37 doubles, so the 16-int allowance leaves
 *  spare room.
 */

#ifdef MESSAGE_VECTORIZATION
static char piv_buf[16 * sizeof(int) + 37 * sizeof(double)];
#endif

/*
 *  Function declarations
 */

/*
 *  These are old-style (non-prototype) declarations: the parameter
 *  lists appear only inside comments, so the compiler does not check
 *  the arguments at the call sites in solve_matrix().
 */

/* Called first by solve_matrix(), before the elimination. */
void
init_LU(/* int cid, int size */);

/* Gaussian elimination; defined later in this file. */
void
gauss(/* int cid, int size, dot_mat pA */);

/* Called by solve_matrix() after gauss(); fills 'x'. */
void
solve_U(/* int cid, int size, dot_mat pA, double* x */);

/* Alternative to solve_U(); its only call in solve_matrix() is commented out. */
void
solve_U_outer(/* int cid, int size, dot_mat pA, double* x */);

/*
 *  solve_matrix() solves a matrix by parallel Gaussian elimination.
 *  The distributed matrix 'pA' is passed by the block-cyclic
 *  distribution:
 *
 *	!HPF$ DISTRIBUTE (CYCLIC(8), CYCLIC)
 *
 *  The matrix 'pA' of each processor includes the corresponding
 *  elements of the right-hand side vector.
 *
 *  parameters:
 *
 *	cid		the node number
 *	size		the problem size
 *	pA		the coefficient matrix
 *	global_x	the answer  (output)
 *  
 */

void
solve_matrix(cid, size, pA, global_x)
    int cid, size;
    dot_mat pA;
    double *global_x;
{
    /*
     *  Runs the three phases init_LU() -> gauss() -> solve_U() in
     *  order.  A time stamp is taken after each phase when
     *  MEASURE_TIME is defined.
     */
#ifdef MEASURE_TIME
    double t_begin = MPI_Wtime();
    double t_init_done, t_gauss_done, t_solve_done;
#endif

    init_LU(cid, size);
#ifdef MEASURE_TIME
    t_init_done = MPI_Wtime();
#endif

    gauss(cid, size, pA);
#ifdef MEASURE_TIME
    t_gauss_done = MPI_Wtime();
#endif

    /* solve_U_outer() takes the same arguments and is the alternative. */
    solve_U(cid, size, pA, global_x);
#ifdef MEASURE_TIME
    t_solve_done = MPI_Wtime();
#endif

    /* Per-node report of the accumulated kernel and pivoting time. */
#ifdef MEASURE_PRECISE_TIME
    printf("[%d] core: %f, piv: %f\n",
	   cid, time_core, time_piv);
#endif

    /* Per-node report of the duration of each phase. */
#ifdef MEASURE_TIME
    printf("[%d] init: %f, gauss: %f, solve: %f\n", cid,
	   t_init_done - t_begin,
	   t_gauss_done - t_init_done,
	   t_solve_done - t_gauss_done);
#endif
}


/*
 *  get_piv() receives the pivot information for the 8 steps
 *  i .. i+7, i.e. the data sent by broad_piv_i().
 *
 *  parameters:
 *
 *	cid		the node number
 *	i		the first of the 8 step indices
 *	inv_piv		receives 29 doubles  (output)
 *
 *  On return piv[i .. i+7], inverse_piv[i .. i+7] and
 *  inv_piv[0 .. 28] have been filled in.
 */

void
get_piv(cid, i, inv_piv)
    int cid, i;
    double inv_piv[];
{
#ifndef MESSAGE_VECTORIZATION
    /*  One message per array.  */
    broad_recv_x(cid, BROAD_PIV_CELL + i * SKIP,
		 &piv[i], 8 * sizeof(int));
    broad_recv_x(cid, BROAD_INV_PIV1 + i * SKIP,
		 &inverse_piv[i], 8 * sizeof(double));
    broad_recv_x(cid, BROAD_PIV + i * SKIP,
		 inv_piv, 29 * sizeof(double));
#else /* MESSAGE_VECTORIZATION */

    /*
     *  One packed message laid out as
     *	8 ints | 8 doubles | 29 doubles
     *  With COPY_RECV_MSG it is received into piv_buf; otherwise the
     *  pointer returned by fbroad_recv_x() is read directly.
     */
#ifdef COPY_RECV_MSG
    char *pp = piv_buf;
    broad_recv_x(cid, BROAD_PIV_CELL + i * SKIP,
		 pp, 8 * sizeof(int) + 37 * sizeof(double));
#else
    char *pp;
    pp = (char*)fbroad_recv_x(cid, BROAD_PIV_CELL + i * SKIP);
#endif

    /*
     *  memmove() is the standard replacement for the legacy bcopy();
     *  note that the destination comes first.  It tolerates overlap
     *  exactly as bcopy() did.
     */
    memmove(&piv[i], pp, 8 * sizeof(int));
    pp += 8 * sizeof(int);
    memmove(&inverse_piv[i], pp, 8 * sizeof(double));
    pp += 8 * sizeof(double);
    memmove(inv_piv, pp, 29 * sizeof(double));
#endif
}

/*
 *  change_piv() applies to l_piv[] the row exchanges belonging to
 *  the 8 steps i .. i+7.  For every step whose piv[] entry maps,
 *  through procid_y(), to this node's cidy, one int is received and
 *  the l_piv[] entry at the next free slot (starting at 'st_j') is
 *  exchanged with the entry at the received index.
 *
 *  NOTE(review): 'size' is not a parameter of this function; it has
 *  to be supplied by a header, or procid_y() has to be a macro that
 *  does not expand it -- confirm.
 */

void
change_piv(cid, i, st_j)
    int cid, i, st_j;
{
    int step;
    int slot = st_j;		/* next l_piv[] position to fill */
    int cidx, cidy;

    lin_trec(cid, cidx, cidy);

    for (step = i; step < i + 8; step++) {
	int from, held;

	/*  Only pivots that map to this node's cidy concern it.  */
	if (procid_y(piv[step], size) != cidy)
	    continue;

#ifdef COPY_RECV_MSG
	broad_recv_x(cid, BROAD_LOC_PIV1 + step * SKIP,
		     &from, sizeof(int));
#else
	from = *((int*)fbroad_recv_x(cid, BROAD_LOC_PIV1 + step * SKIP));
#endif

	/*  Exchange l_piv[slot] and l_piv[from].  */
	held = l_piv[slot];
	l_piv[slot] = l_piv[from];
	l_piv[from] = held;
	slot++;
    }
}

/*
 *  broad_piv_i() sends the pivot information for the 8 steps
 *  i .. i+7; get_piv() is the matching receiver.
 *
 *  parameters:
 *
 *	cid		the node number
 *	i		the first of the 8 step indices
 *	inv_piv		29 doubles to send along with piv[] and
 *			inverse_piv[]
 */

void
broad_piv_i(cid, i, inv_piv)
    int cid, i;
    double inv_piv[];
{
#ifndef MESSAGE_VECTORIZATION
    /*  One message per array.  */
    broad_send_x(cid, BROAD_PIV_CELL + i * SKIP,
		 &piv[i], 8 * sizeof(int));
    broad_send_x(cid, BROAD_INV_PIV1 + i * SKIP,
		 &inverse_piv[i], 8 * sizeof(double));
    broad_send_x(cid, BROAD_PIV + i * SKIP,
		 inv_piv, 29 * sizeof(double));
#else /* MESSAGE_VECTORIZATION */
    char *pp = piv_buf;

    /*
     *  Pack everything into piv_buf as
     *	8 ints | 8 doubles | 29 doubles
     *  so that a single message suffices.  memmove() is the standard
     *  replacement for the legacy bcopy(); note that the destination
     *  comes first.
     */
    memmove(pp, &piv[i], 8 * sizeof(int));
    pp += 8 * sizeof(int);
    memmove(pp, &inverse_piv[i], 8 * sizeof(double));
    pp += 8 * sizeof(double);
    memmove(pp, inv_piv, 29 * sizeof(double));

    broad_send_x(cid, BROAD_PIV_CELL + i * SKIP,
		 piv_buf, 8 * sizeof(int) + 37 * sizeof(double));
#endif
}

/*
 *  get_piv_1() receives the pivot information of the single step
 *  'i' (the data sent by broad_piv_i_1()) and, when piv[i] maps to
 *  this node's cidy, applies the row exchange locally.
 *
 *  parameters:
 *
 *	cid		the node number
 *	i		the step index
 *	st_j		the l_piv[] position exchanged with the
 *			received local index
 *	py		if not NULL, py[0] and py[tmp_j - st_j] are
 *			exchanged as well, tmp_j being the received
 *			local index
 *
 *  NOTE(review): 'size' is not a parameter of this function; it has
 *  to be supplied by a header, or procid_y() has to be a macro that
 *  does not expand it -- confirm.
 */

void
get_piv_1(cid, i, st_j, py)
    int cid, i, st_j;
    double *py;
{
    int cidx, cidy;

#ifndef MESSAGE_VECTORIZATION
    lin_trec(cid, cidx, cidy);
    broad_recv_x(cid, BROAD_PIV_CELL + i * SKIP,
		 &piv[i], sizeof(int));
    broad_recv_x(cid, BROAD_INV_PIV1 + i * SKIP,
		 &inverse_piv[i], sizeof(double));
#else /* MESSAGE_VECTORIZATION */
    char *msg;
    double inv;
    int row;

    lin_trec(cid, cidx, cidy);
#ifdef COPY_RECV_MSG
    msg = piv_buf;
    broad_recv_x(cid, BROAD_PIV_CELL + i * SKIP,
		 piv_buf, sizeof(int) + sizeof(double));
#else
    msg = (char*)fbroad_recv_x(cid, BROAD_PIV_CELL + i * SKIP);
#endif

    /*
     *  The message is one double followed by one int.  The bytes are
     *  copied into properly typed locals rather than read through
     *  casted pointers: piv_buf is a char array, so nothing
     *  guarantees that it is aligned for a double, and it may not be
     *  accessed as one.
     */
    memcpy(&inv, msg, sizeof(double));
    memcpy(&row, msg + sizeof(double), sizeof(int));
    inverse_piv[i] = inv;
    piv[i] = row;
#endif
    if (procid_y(piv[i], size) == cidy) {
	int tmp_j, tmp_i;
	double tmp_py;

	/*  Local index to exchange with position st_j.  */
#ifdef COPY_RECV_MSG
	broad_recv_x(cid, BROAD_LOC_PIV1 + i * SKIP,
		     &tmp_j, sizeof(int));
#else
	tmp_j = *((int*)fbroad_recv_x(cid, BROAD_LOC_PIV1 + i * SKIP));
#endif

	tmp_i = l_piv[st_j];
	l_piv[st_j] = l_piv[tmp_j];
	l_piv[tmp_j] = tmp_i;

	/*  'py' is indexed relative to st_j.  */
	if (py != NULL) {
	    tmp_py = py[0];
	    py[0] = py[tmp_j - st_j];
	    py[tmp_j - st_j] = tmp_py;
	}
    }
}

/*
 *  broad_piv_i_1() sends the pivot information of the single step
 *  'i', namely inverse_piv[i] and piv[i]; get_piv_1() is the
 *  matching receiver.
 *
 *  parameters:
 *
 *	cid		the node number
 *	i		the step index
 */

void
broad_piv_i_1(cid, i)
    int cid, i;
{
    int cidx, cidy;

    /*
     *  NOTE(review): cidx and cidy are not used afterwards in this
     *  function; the call is kept exactly as it was.
     */
    lin_trec(cid, cidx, cidy);

#ifndef MESSAGE_VECTORIZATION
    broad_send_x(cid, BROAD_PIV_CELL + i * SKIP,
		 &piv[i], sizeof(int));
    broad_send_x(cid, BROAD_INV_PIV1 + i * SKIP,
		 &inverse_piv[i], sizeof(double));
#else /* MESSAGE_VECTORIZATION */
    /*
     *  Pack one double followed by one int into piv_buf.  The bytes
     *  are copied rather than stored through casted pointers:
     *  piv_buf is a char array, so nothing guarantees that it is
     *  aligned for a double, and it may not be accessed as one.  The
     *  layout of the message is unchanged.
     */
    memcpy(piv_buf, &inverse_piv[i], sizeof(double));
    memcpy(piv_buf + sizeof(double), &piv[i], sizeof(int));

    broad_send_x(cid, BROAD_PIV_CELL + i * SKIP,
		 piv_buf, sizeof(int) + sizeof(double));
#endif
}


/*
 *  Gaussian elimination.  Elements of the lower triangular part of
 *  the dot_mat 'pA' are eliminated and an upper triangular matrix
 *  that is solved by 'solve_U()' is produced.
 */

void
gauss(cid, size, pA)
    int cid;
    int size;
    dot_mat pA;
{
    int i;
    double inv_piv[29];
    double *piv_r[9];
    int cidx, cidy;
    int num_data_x, num_data_y;
    int st_j = 0;

    lin_trec(cid, cidx, cidy);
    num_data_x = num_in_proc_x(cidx, size);
    num_data_y = num_in_proc_y(cidy, size);

#ifdef UNROLL_8	/* Unrolled by 8 */

#ifdef NO_SOFTWARE_PIPE

    for (i = 0; i < size - 8; i += 8) {
	int j, k;
	register double inv_pivot;
	double *pr0, *pr1, *pr2, *pr3;
	double *pr4, *pr5, *pr6, *pr7;
	double *py;
	double a1, a2, a3;
	double a4, a5, a6, a7;
	int local_ix = tolocal_x(cidx, i, size);
	int tmp_s, num_x;
	int active_f = (procid_x(i, size) == cidx);

	if (active_f) {

	    MP_TIME(time_piv1 = MPI_Wtime());
	    p_pivot_8_y(cid, size, pA, i, st_j, inv_piv, NULL);
	    MP_TIME(time_piv += MPI_Wtime() - time_piv1);

	    broad_piv_i(cid, i, inv_piv);
	}
	else {
	    get_piv(cid, i, inv_piv);
	    change_piv(cid, i, st_j);
	}
	for (j = 0; j < 8; j++) {
	    if (procid_y(piv[i + j], size) == cidy) {
		assert(tolocal_y(cidy, piv[i + j], size) == l_piv[st_j]);
		st_j++;
	    }
	}
	broad_piv(cid, size, num_data_x, pA, i, st_j, piv_r);

	if (active_f) {
	    tmp_s = local_ix + 8;
	}
	else {
	    tmp_s = g_to_l[i];
	}
	num_x = num_data_x - tmp_s;

	pr0 = piv_r[0]; pr1 = piv_r[1];
	pr2 = piv_r[2]; pr3 = piv_r[3];
	pr4 = piv_r[4]; pr5 = piv_r[5];
	pr6 = piv_r[6]; pr7 = piv_r[7];
	py = piv_r[8];
	a1 = inv_piv[0]; a2 = inv_piv[1];
	a3 = inv_piv[2]; a4 = inv_piv[3];
	a5 = inv_piv[4]; a6 = inv_piv[5];

	/*  update 2nd, 3rd & 4th pivot data  */
	if (pr0 == NULL) {
	    pr0 = get_pivot8_x(cid, i, 0);
	}
	if (pr1 == NULL) {
	    pr1 = get_pivot8_x(cid, i, 1);
	}
	if (pr2 == NULL) {
	    pr2 = get_pivot8_x(cid, i, 2);
	}
	if (pr3 == NULL) {
	    pr3 = get_pivot8_x(cid, i, 3);
	}
	for (k = 0; k <= num_x; k++) {
	    double p = pr0[k];
	    double q = pr1[k] - a1 * p;
	    double r = pr2[k] - a2 * p - a3 * q;
	    double s = pr3[k] - a4 * p - a5 * q - a6 * r;
	    pr1[k] = q;
	    pr2[k] = r;
	    pr3[k] = s;
	}

	a1 = inv_piv[6]; a2 = inv_piv[7];
	a3 = inv_piv[8]; a4 = inv_piv[9];

	/*  update 5th pivot data  */
	if (pr4 == NULL) {
	    pr4 = get_pivot8_x(cid, i, 4);
	}
	for (k = 0; k <= num_x; k++)
	    pr4[k] -=
		a1 * pr0[k] +
		a2 * pr1[k] +
		a3 * pr2[k] +
		a4 * pr3[k];
	a1 = inv_piv[10];
	a2 = inv_piv[11];
	a3 = inv_piv[12];
	a4 = inv_piv[13];
	a5 = inv_piv[14];
	/*  update 6th pivot data  */
	if (pr5 == NULL) {
	    pr5 = get_pivot8_x(cid, i, 5);
	}
	for (k = 0; k <= num_x; k++)
	    pr5[k] -=
		a1 * pr0[k] +
		a2 * pr1[k] +
		a3 * pr2[k] +
		a4 * pr3[k] +
		a5 * pr4[k];
	a1 = inv_piv[15];
	a2 = inv_piv[16];
	a3 = inv_piv[17];
	a4 = inv_piv[18];
	a5 = inv_piv[19];
	a6 = inv_piv[20];
	/*  update 7th pivot data  */
	if (pr6 == NULL) {
	    pr6 = get_pivot8_x(cid, i, 6);
	}
	for (k = 0; k <= num_x; k++)
	    pr6[k] -=
		a1 * pr0[k] +
		a2 * pr1[k] +
		a3 * pr2[k] +
		a4 * pr3[k] +
		a5 * pr4[k] +
		a6 * pr5[k];
	a1 = inv_piv[21];
	a2 = inv_piv[22];
	a3 = inv_piv[23];
	a4 = inv_piv[24];
	a5 = inv_piv[25];
	a6 = inv_piv[26];
	a7 = inv_piv[27];
	inv_pivot = inv_piv[28];
	/*  update 8th pivot data  */
	if (pr7 == NULL) {
	    pr7 = get_pivot8_x(cid, i, 7);
	}
	for (k = 0; k <= num_x; k++)
	    pr7[k] -=
		a1 * pr0[k] +
		a2 * pr1[k] +
		a3 * pr2[k] +
		a4 * pr3[k] +
		a5 * pr4[k] +
		a6 * pr5[k] +
		a7 * pr6[k];

	if (py == NULL) {
	    py = get_pivot8_y(cid, i);
	}

	MP_TIME(time_core1 = MPI_Wtime());

	if (active_f) {
	    int tile_j, tile_k;

#ifdef NO_TILE
	    for (j = st_j; j < num_data_y; j++) {
		int piv_j = l_piv[j];
		double c1, c2, c3, c4;
		double c5, c6, c7, c8;
		double *p = &pA[piv_j][tmp_s];

		c1 = p[-8]; c2 = p[-7];
		c3 = p[-6]; c4 = p[-5];
		c5 = p[-4]; c6 = p[-3];
		c7 = p[-2];
		c8 = p[-1] * inv_pivot;

		for (k = 0; k <= num_x; k++)
		    p[k] -=
			c1 * pr0[k] + c2 * pr1[k] +
			c3 * pr2[k] + c4 * pr3[k] +
			c5 * pr4[k] + c6 * pr5[k] +
			c7 * pr6[k] + c8 * pr7[k];
	    }
#else
	    for (tile_j = 0; tile_j < (num_data_y - st_j) / TILE_J; tile_j++) {
		int tj = tile_j * TILE_J;
		int *l_pivp = &l_piv[st_j + tj];
		for (tile_k = 0; tile_k < (num_x + 1) / TILE_K; tile_k++) {
		    int tk = tile_k * TILE_K;

		    for (j = 0; j < TILE_J; j++) {
			int piv_j = l_pivp[j];
			double c1, c2, c3, c4;
			double c5, c6, c7, c8;
			double *p = &pA[piv_j][tmp_s];
			double *lp = &p[tk];

			c1 = p[-8]; c2 = p[-7];
			c3 = p[-6]; c4 = p[-5];
			c5 = p[-4]; c6 = p[-3];
			c7 = p[-2];
			c8 = p[-1] * inv_pivot;

			for (k = 0; k < TILE_K; k++)
			    lp[k] -=
				c1 * pr0[tk + k] +
				c2 * pr1[tk + k] +
				c3 * pr2[tk + k] +
				c4 * pr3[tk + k] +
				c5 * pr4[tk + k] +
				c6 * pr5[tk + k] +
				c7 * pr6[tk + k] +
				c8 * pr7[tk + k];
		    }
		}
	    }
	    for (j = st_j; j < tile_j * TILE_J + st_j; j++) {
		int piv_j = l_piv[j];
		double c1, c2, c3, c4;
		double c5, c6, c7, c8;
		double *p = &pA[piv_j][tmp_s];

		c1 = p[-8]; c2 = p[-7];
		c3 = p[-6]; c4 = p[-5];
		c5 = p[-4]; c6 = p[-3];
		c7 = p[-2];
		c8 = p[-1] * inv_pivot;

		for (k = tile_k * TILE_K; k <= num_x; k++)
		    p[k] -=
			c1 * pr0[k] +
			c2 * pr1[k] +
			c3 * pr2[k] +
			c4 * pr3[k] +
			c5 * pr4[k] +
			c6 * pr5[k] +
			c7 * pr6[k] +
			c8 * pr7[k];
	    }

	    for (; j < num_data_y; j++) {
		int piv_j = l_piv[j];
		double c1, c2, c3, c4;
		double c5, c6, c7, c8;
		double *p = &pA[piv_j][tmp_s];

		c1 = p[-8];
		c2 = p[-7];
		c3 = p[-6];
		c4 = p[-5];
		c5 = p[-4];
		c6 = p[-3];
		c7 = p[-2];
		c8 = p[-1] * inv_pivot;

		for (k = 0; k <= num_x; k++)
		    p[k] -=
			c1 * pr0[k] +
			c2 * pr1[k] +
			c3 * pr2[k] +
			c4 * pr3[k] +
			c5 * pr4[k] +
			c6 * pr5[k] +
			c7 * pr6[k] +
			c8 * pr7[k];
	    }
#endif
	}
	else {
	    int tile_j, tile_k;
	    double *pyp = py;

#ifdef NO_TILE
	    for (j = st_j; j < num_data_y; j++, pyp += 8) {
		int piv_j = l_piv[j];
		double c1, c2, c3, c4;
		double c5, c6, c7, c8;
		double *p = &pA[piv_j][tmp_s];

		c1 = pyp[0]; c2 = pyp[1];
		c3 = pyp[2]; c4 = pyp[3];
		c5 = pyp[4]; c6 = pyp[5];
		c7 = pyp[6];
		c8 = pyp[7] * inv_pivot;

		for (k = 0; k <= num_x; k++)
		    p[k] -=
			c1 * pr0[k] + c2 * pr1[k] +
			c3 * pr2[k] + c4 * pr3[k] +
			c5 * pr4[k] + c6 * pr5[k] +
			c7 * pr6[k] + c8 * pr7[k];
	    }
#else
	    for (tile_j = 0; tile_j < (num_data_y - st_j) / TILE_J; tile_j++) {
		int tj = tile_j * TILE_J;
		double *py0 = &py[8 * tj];
		int *l_pivp = &l_piv[st_j + tj];

		for (tile_k = 0; tile_k < (num_x + 1) / TILE_K; tile_k++) {
		    int tk = tile_k * TILE_K;
		    double *py1 = py0;

		    for (j = 0; j < TILE_J; j++, py1 += 8) {
			int piv_j = l_pivp[j];
			double c1, c2, c3, c4;
			double c5, c6, c7, c8;
			double *p = &pA[piv_j][tmp_s];
			double *lp = &p[tk];

			c1 = py1[0]; c2 = py1[1];
			c3 = py1[2]; c4 = py1[3];
			c5 = py1[4]; c6 = py1[5];
			c7 = py1[6];
			c8 = py1[7] * inv_pivot;

			for (k = 0; k < TILE_K; k++)
			    lp[k] -=
				c1 * pr0[tk + k] +
				c2 * pr1[tk + k] +
				c3 * pr2[tk + k] +
				c4 * pr3[tk + k] +
				c5 * pr4[tk + k] +
				c6 * pr5[tk + k] +
				c7 * pr6[tk + k] +
				c8 * pr7[tk + k];
		    }
		}
	    }
	    for (j = st_j; j < tile_j * TILE_J + st_j; j++, pyp += 8) {
		int piv_j = l_piv[j];
		double c1, c2, c3, c4;
		double c5, c6, c7, c8;
		double *p = &pA[piv_j][tmp_s];

		c1 = pyp[0]; c2 = pyp[1];
		c3 = pyp[2]; c4 = pyp[3];
		c5 = pyp[4]; c6 = pyp[5];
		c7 = pyp[6];
		c8 = pyp[7] * inv_pivot;

		for (k = tile_k * TILE_K; k <= num_x; k++)
		    p[k] -=
			c1 * pr0[k] + c2 * pr1[k] +
			c3 * pr2[k] + c4 * pr3[k] +
			c5 * pr4[k] + c6 * pr5[k] +
			c7 * pr6[k] + c8 * pr7[k];
	    }
	    for (; j < num_data_y; j++, pyp += 8) {
		int piv_j = l_piv[j];
		double c1, c2, c3, c4;
		double c5, c6, c7, c8;
		double *p = &pA[piv_j][tmp_s];

		c1 = pyp[0]; c2 = pyp[1];
		c3 = pyp[2]; c4 = pyp[3];
		c5 = pyp[4]; c6 = pyp[5];
		c7 = pyp[6];
		c8 = pyp[7] * inv_pivot;

		for (k = 0; k <= num_x; k++)
		    p[k] -=
			c1 * pr0[k] + c2 * pr1[k] +
			c3 * pr2[k] + c4 * pr3[k] +
			c5 * pr4[k] + c6 * pr5[k] +
			c7 * pr6[k] + c8 * pr7[k];
	    }
#endif
	}

	MP_TIME(time_core += MPI_Wtime() - time_core1);

	/** MPI_Barrier(MPI_COMM_WORLD); **/

    }

#else /* SOFTWARE_PIPE */

    i = 0;
    if (i < size - 8) {
	/* prologue */
	int j;
	int active_f = (procid_x(i, size) == cidx);
	if (active_f) {

	    MP_TIME(time_piv1 = MPI_Wtime());
	    p_pivot_8_y(cid, size, pA, i, st_j, inv_piv, NULL);
	    MP_TIME(time_piv += MPI_Wtime() - time_piv1);

	    broad_piv_i(cid, i, inv_piv);
	}
	else {
	    get_piv(cid, i, inv_piv);
	    change_piv(cid, i, st_j);
	}
	for (j = 0; j < 8; j++) {
	    if (procid_y(piv[i + j], size) == cidy) {
		assert(tolocal_y(cidy, piv[i + j], size) == l_piv[st_j]);
		st_j++;
	    }
	}
	broad_piv(cid, size, num_data_x, pA, i, st_j, piv_r);
    }

    for (; i < size - 16; i += 8) {
	int k;
	register double inv_pivot;
	double *pr0, *pr1, *pr2, *pr3;
	double *pr4, *pr5, *pr6, *pr7; 
	double *py;
	double a1, a2, a3;
	double a4, a5, a6, a7;
	int tmp_s, num_x;
	int active_f = (procid_x(i, size) == cidx);
	int next_active_f = (procid_x(i + 8, size) == cidx);

	if (active_f) {
	    tmp_s = tolocal_x(cidx, i, size) + 8;
	}
	else {
	    tmp_s = g_to_l[i];
	}
	num_x = num_data_x - tmp_s;
	pr0 = piv_r[0]; pr1 = piv_r[1];
	pr2 = piv_r[2]; pr3 = piv_r[3];
	pr4 = piv_r[4]; pr5 = piv_r[5];
	pr6 = piv_r[6]; pr7 = piv_r[7];
	py = piv_r[8];
	a1 = inv_piv[0]; a2 = inv_piv[1];
	a3 = inv_piv[2]; a4 = inv_piv[3];
	a5 = inv_piv[4]; a6 = inv_piv[5];

	/*  update 2nd, 3rd & 4th pivot data  */
	if (pr0 == NULL) {
	    pr0 = get_pivot8_x(cid, i, 0);
	}
	if (pr1 == NULL) {
	    pr1 = get_pivot8_x(cid, i, 1);
	}
	if (pr2 == NULL) {
	    pr2 = get_pivot8_x(cid, i, 2);
	}
	if (pr3 == NULL) {
	    pr3 = get_pivot8_x(cid, i, 3);
	}
	for (k = 0; k <= num_x; k++) {
	    double p = pr0[k];
	    double q = pr1[k] - a1 * p;
	    double r = pr2[k] - a2 * p - a3 * q;
	    double s = pr3[k] - a4 * p - a5 * q - a6 * r;
	    pr1[k] = q;
	    pr2[k] = r;
	    pr3[k] = s;
	}

	a1 = inv_piv[6]; a2 = inv_piv[7];
	a3 = inv_piv[8]; a4 = inv_piv[9];

	/*  update 5th pivot data  */
	if (pr4 == NULL) {
	    pr4 = get_pivot8_x(cid, i, 4);
	}
	for (k = 0; k <= num_x; k++)
	    pr4[k] -=
		a1 * pr0[k] + a2 * pr1[k] +
		a3 * pr2[k] + a4 * pr3[k];

	a1 = inv_piv[10]; a2 = inv_piv[11];
	a3 = inv_piv[12]; a4 = inv_piv[13];
	a5 = inv_piv[14];

	/*  update 6th pivot data  */
	if (pr5 == NULL) {
	    pr5 = get_pivot8_x(cid, i, 5);
	}
	for (k = 0; k <= num_x; k++)
	    pr5[k] -=
		a1 * pr0[k] + a2 * pr1[k] +
		a3 * pr2[k] + a4 * pr3[k] +
		a5 * pr4[k];

	a1 = inv_piv[15]; a2 = inv_piv[16];
	a3 = inv_piv[17]; a4 = inv_piv[18];
	a5 = inv_piv[19]; a6 = inv_piv[20];

	/*  update 7th pivot data  */
	if (pr6 == NULL) {
	    pr6 = get_pivot8_x(cid, i, 6);
	}
	for (k = 0; k <= num_x; k++)
	    pr6[k] -=
		a1 * pr0[k] + a2 * pr1[k] +
		a3 * pr2[k] + a4 * pr3[k] +
		a5 * pr4[k] + a6 * pr5[k];

	a1 = inv_piv[21]; a2 = inv_piv[22];
	a3 = inv_piv[23]; a4 = inv_piv[24];
	a5 = inv_piv[25]; a6 = inv_piv[26];
	a7 = inv_piv[27];
	inv_pivot = inv_piv[28];

	/*  update 8th pivot data  */
	if (pr7 == NULL) {
	    pr7 = get_pivot8_x(cid, i, 7);
	}
	for (k = 0; k <= num_x; k++)
	    pr7[k] -=
		a1 * pr0[k] + a2 * pr1[k] +
		a3 * pr2[k] + a4 * pr3[k] +
		a5 * pr4[k] + a6 * pr5[k] +
		a7 * pr6[k];

	if (py == NULL) {
	    py = get_pivot8_y(cid, i);
	}

	if (next_active_f) {
	    int j;
	    int tj, tk;
	    double *pyp = py;
	    int l;

	    for (j = st_j; j < num_data_y; j++, pyp += 8) {
		int k;
		double c1, c2, c3, c4;
		double c5, c6, c7, c8;
		double *p = &pA[l_piv[j]][tmp_s];

		c1 = pyp[0]; c2 = pyp[1];
		c3 = pyp[2]; c4 = pyp[3];
		c5 = pyp[4]; c6 = pyp[5];
		c7 = pyp[6];
		c8 = pyp[7] * inv_pivot;
		pyp[7] = c8;

		for (k = 0; k < 8; k++)
		    p[k] -=
			c1 * pr0[k] + c2 * pr1[k] +
			c3 * pr2[k] + c4 * pr3[k] +
			c5 * pr4[k] + c6 * pr5[k] +
			c7 * pr6[k] + c8 * pr7[k];
	    }

	    MP_TIME(time_piv1 = MPI_Wtime());
	    p_pivot_8_y(cid, size, pA, i + 8, st_j, inv_piv, py);
	    MP_TIME(time_piv += MPI_Wtime() - time_piv1);

	    broad_piv_i(cid, i + 8, inv_piv);

	    pyp = py;
	    for (l = 0; l < 8; l++) {
		int piv_i = piv[i + 8 + l];
		if (procid_y(piv_i, size) == cidy) {
		    int k;
		    double c1, c2, c3, c4;
		    double c5, c6, c7, c8;
		    int local_i = tolocal_y(cidy, piv_i, size);
		    double *p = &pA[local_i][tmp_s];
		    assert(l_piv[st_j] == local_i);
		    c1 = pyp[0]; c2 = pyp[1];
		    c3 = pyp[2]; c4 = pyp[3];
		    c5 = pyp[4]; c6 = pyp[5];
		    c7 = pyp[6]; c8 = pyp[7];

		    for (k = 8; k <= num_x; k++)
			p[k] -=
			    c1 * pr0[k] + c2 * pr1[k] +
			    c3 * pr2[k] + c4 * pr3[k] +
			    c5 * pr4[k] + c6 * pr5[k] +
			    c7 * pr6[k] + c8 * pr7[k];
		    pyp += 8;
		    st_j++;
		}
	    }

	    broad_piv(cid, size, num_data_x, pA, i + 8, st_j, piv_r);

	    MP_TIME(time_core1 = MPI_Wtime());

#ifdef NO_TILE
	    for (j = st_j; j < num_data_y; j++, pyp += 8) {
		int k;
		double c1, c2, c3, c4;
		double c5, c6, c7, c8;
		double *p = &pA[l_piv[j]][tmp_s];

		c1 = pyp[0]; c2 = pyp[1];
		c3 = pyp[2]; c4 = pyp[3];
		c5 = pyp[4]; c6 = pyp[5];
		c7 = pyp[6]; c8 = pyp[7];

		for (k = 8; k <= num_x; k++)
		    p[k] -=
			c1 * pr0[k] + c2 * pr1[k] +
			c3 * pr2[k] + c4 * pr3[k] +
			c5 * pr4[k] + c6 * pr5[k] +
			c7 * pr6[k] + c8 * pr7[k];
	    }
#else /* NO_TILE */
	    for (tj = 0; tj < num_data_y - st_j - TILE_J + 1; tj += TILE_J) {
		double *py0 = &pyp[8 * tj];
		int *l_pivp = &l_piv[st_j + tj];

		for (tk = 8; tk < num_x + 2 - TILE_K; tk += TILE_K) {
		    int j;
		    double *py1 = py0;
		    double *p0, *p1, *p2, *p3;
		    double *p4, *p5, *p6, *p7;
		    int ttk = tmp_s + tk;

		    p0 = &pr0[tk]; p1 = &pr1[tk];
		    p2 = &pr2[tk]; p3 = &pr3[tk];
		    p4 = &pr4[tk]; p5 = &pr5[tk];
		    p6 = &pr6[tk]; p7 = &pr7[tk];

		    for (j = 0; j < TILE_J; j++, py1 += 8) {
			int k;
			double c1, c2, c3, c4;
			double c5, c6, c7, c8;
			double *lp = &pA[l_pivp[j]][ttk];

			c1 = py1[0]; c2 = py1[1];
			c3 = py1[2]; c4 = py1[3];
			c5 = py1[4]; c6 = py1[5];
			c7 = py1[6]; c8 = py1[7];

			for (k = 0; k < TILE_K; k++)
			    lp[k] -=
				c1 * p0[k] + c2 * p1[k] +
				c3 * p2[k] + c4 * p3[k] +
				c5 * p4[k] + c6 * p5[k] +
				c7 * p6[k] + c8 * p7[k];
		    }
		}
	    }
	    for (j = st_j; j < tj + st_j; j++, pyp += 8) {
		int k;
		double c1, c2, c3, c4;
		double c5, c6, c7, c8;
		double *p = &pA[l_piv[j]][tmp_s];

		c1 = pyp[0]; c2 = pyp[1];
		c3 = pyp[2]; c4 = pyp[3];
		c5 = pyp[4]; c6 = pyp[5];
		c7 = pyp[6]; c8 = pyp[7];

		for (k = tk; k <= num_x; k++)
		    p[k] -=
			c1 * pr0[k] + c2 * pr1[k] +
			c3 * pr2[k] + c4 * pr3[k] +
			c5 * pr4[k] + c6 * pr5[k] +
			c7 * pr6[k] + c8 * pr7[k];
	    }
	    for (; j < num_data_y; j++, pyp += 8) {
		int k;
		double c1, c2, c3, c4;
		double c5, c6, c7, c8;
		double *p = &pA[l_piv[j]][tmp_s];

		c1 = pyp[0]; c2 = pyp[1];
		c3 = pyp[2]; c4 = pyp[3];
		c5 = pyp[4]; c6 = pyp[5];
		c7 = pyp[6]; c8 = pyp[7];

		for (k = 8; k <= num_x; k++)
		    p[k] -=
			c1 * pr0[k] + c2 * pr1[k] +
			c3 * pr2[k] + c4 * pr3[k] +
			c5 * pr4[k] + c6 * pr5[k] +
			c7 * pr6[k] + c8 * pr7[k];
	    }
#endif /* NO_TILE */

	    MP_TIME(time_core += MPI_Wtime() - time_core1);

	}
	else {
	    int save_j, new_st_j;

	    if (active_f) {
		int j;
		int tj, tk;
		int l;

		MP_TIME(time_core1 = MPI_Wtime());

#ifdef NO_TILE
		for (j = st_j; j < num_data_y; j++) {
		    int k;
		    double c1, c2, c3, c4;
		    double c5, c6, c7, c8;
		    double *p = &pA[l_piv[j]][tmp_s];

		    c1 = p[-8]; c2 = p[-7];
		    c3 = p[-6]; c4 = p[-5];
		    c5 = p[-4]; c6 = p[-3];
		    c7 = p[-2];
		    c8 = p[-1] * inv_pivot;
		    for (k = 0; k <= num_x; k++)
			p[k] -=
			    c1 * pr0[k] + c2 * pr1[k] +
			    c3 * pr2[k] + c4 * pr3[k] +
			    c5 * pr4[k] + c6 * pr5[k] +
			    c7 * pr6[k] + c8 * pr7[k];
		}
#else /* NO_TILE */
		for (tj = st_j, tk = 0; tj < num_data_y - TILE_J + 1; tj += TILE_J) {
		    int *l_pivp = &l_piv[tj];

		    for (tk = 0; tk < num_x + 2 - TILE_K; tk += TILE_K) {
			int j;
			double *p0, *p1, *p2, *p3;
			double *p4, *p5, *p6, *p7;

			p0 = &pr0[tk]; p1 = &pr1[tk];
			p2 = &pr2[tk]; p3 = &pr3[tk];
			p4 = &pr4[tk]; p5 = &pr5[tk];
			p6 = &pr6[tk]; p7 = &pr7[tk];

			for (j = 0; j < TILE_J; j++) {
			    int k;
			    double c1, c2, c3, c4;
			    double c5, c6, c7, c8;
			    double *p = &pA[l_pivp[j]][tmp_s];
			    double *lp = &p[tk];

			    c1 = p[-8]; c2 = p[-7];
			    c3 = p[-6]; c4 = p[-5];
			    c5 = p[-4]; c6 = p[-3];
			    c7 = p[-2];
			    c8 = p[-1] * inv_pivot;

			    for (k = 0; k < TILE_K; k++)
				lp[k] -=
				    c1 * p0[k] + c2 * p1[k] +
				    c3 * p2[k] + c4 * p3[k] +
				    c5 * p4[k] + c6 * p5[k] +
				    c7 * p6[k] + c8 * p7[k];
			}
		    }
		}
		for (j = st_j; j < tj; j++) {
		    int k;
		    double c1, c2, c3, c4;
		    double c5, c6, c7, c8;
		    double *p = &pA[l_piv[j]][tmp_s];

		    c1 = p[-8]; c2 = p[-7];
		    c3 = p[-6]; c4 = p[-5];
		    c5 = p[-4]; c6 = p[-3];
		    c7 = p[-2];
		    c8 = p[-1] * inv_pivot;

		    for (k = tk; k <= num_x; k++)
			p[k] -=
			    c1 * pr0[k] + c2 * pr1[k] +
			    c3 * pr2[k] + c4 * pr3[k] +
			    c5 * pr4[k] + c6 * pr5[k] +
			    c7 * pr6[k] + c8 * pr7[k];
		}

		for (; j < num_data_y; j++) {
		    int k;
		    double c1, c2, c3, c4;
		    double c5, c6, c7, c8;
		    double *p = &pA[l_piv[j]][tmp_s];

		    c1 = p[-8]; c2 = p[-7];
		    c3 = p[-6]; c4 = p[-5];
		    c5 = p[-4]; c6 = p[-3];
		    c7 = p[-2];
		    c8 = p[-1] * inv_pivot;

		    for (k = 0; k <= num_x; k++)
			p[k] -=
			    c1 * pr0[k] + c2 * pr1[k] +
			    c3 * pr2[k] + c4 * pr3[k] +
			    c5 * pr4[k] + c6 * pr5[k] +
			    c7 * pr6[k] + c8 * pr7[k];
		}
#endif /* NO_TILE */

		MP_TIME(time_core += MPI_Wtime() - time_core1);

		save_j = j;

		if (j == num_data_y) {
		    get_piv(cid, i + 8, inv_piv);
		    change_piv(cid, i + 8, st_j);
		}

		new_st_j = st_j;

		for (l = 0; l < 8; l++) {
		    if (procid_y(piv[i + 8 + l], size) == cidy) {
			assert(tolocal_y(cidy, piv[i + 8 + l], size) == l_piv[new_st_j]);
			new_st_j++;
		    }
		}
		broad_piv(cid, size, num_data_x, pA, i + 8, new_st_j, piv_r);

		st_j = new_st_j;
	    }
	    else {
		int j;
		int l;
		double *pyp;
		int tj, tk;

		MP_TIME(time_core1 = MPI_Wtime());

#ifdef NO_TILE
		for (j = st_j, pyp = py; j < num_data_y; j++, pyp += 8) {
		    int k;
		    double c1, c2, c3, c4;
		    double c5, c6, c7, c8;
		    double *p = &pA[l_piv[j]][tmp_s];

		    c1 = pyp[0]; c2 = pyp[1];
		    c3 = pyp[2]; c4 = pyp[3];
		    c5 = pyp[4]; c6 = pyp[5];
		    c7 = pyp[6];
		    c8 = pyp[7] * inv_pivot;

		    for (k = 0; k <= num_x; k++)
			p[k] -=
			    c1 * pr0[k] + c2 * pr1[k] +
			    c3 * pr2[k] + c4 * pr3[k] +
			    c5 * pr4[k] + c6 * pr5[k] +
			    c7 * pr6[k] + c8 * pr7[k];
		}
#else /* NO_TILE */
		for (tj = 0; tj < num_data_y - st_j - TILE_J + 1; tj += TILE_J) {
		    double *py0 = &py[8 * tj];
		    int *l_pivp = &l_piv[st_j + tj];

		    for (tk = 0; tk < num_x + 2 - TILE_K; tk += TILE_K) {
			int j;
			double *py1;
			double *p0, *p1, *p2, *p3;
			double *p4, *p5, *p6, *p7;

			p0 = &pr0[tk]; p1 = &pr1[tk];
			p2 = &pr2[tk]; p3 = &pr3[tk];
			p4 = &pr4[tk]; p5 = &pr5[tk];
			p6 = &pr6[tk]; p7 = &pr7[tk];

			for (j = 0, py1 = py0; j < TILE_J; j++, py1 += 8) {
			    int k;
			    double c1, c2, c3, c4;
			    double c5, c6, c7, c8;
			    double *p = &pA[l_pivp[j]][tmp_s];
			    double *lp = &p[tk];

			    c1 = py1[0]; c2 = py1[1];
			    c3 = py1[2]; c4 = py1[3];
			    c5 = py1[4]; c6 = py1[5];
			    c7 = py1[6];
			    c8 = py1[7] * inv_pivot;

			    for (k = 0; k < TILE_K; k++)
				lp[k] -=
				    c1 * p0[k] + c2 * p1[k] +
				    c3 * p2[k] + c4 * p3[k] +
				    c5 * p4[k] + c6 * p5[k] +
				    c7 * p6[k] + c8 * p7[k];
			}
		    }
		}
		for (j = st_j, pyp = py; j < tj + st_j; j++, pyp += 8) {
		    int k;
		    double c1, c2, c3, c4;
		    double c5, c6, c7, c8;
		    double *p = &pA[l_piv[j]][tmp_s];

		    c1 = pyp[0]; c2 = pyp[1];
		    c3 = pyp[2]; c4 = pyp[3];
		    c5 = pyp[4]; c6 = pyp[5];
		    c7 = pyp[6];
		    c8 = pyp[7] * inv_pivot;

		    for (k = tk; k <= num_x; k++)
			p[k] -=
			    c1 * pr0[k] + c2 * pr1[k] +
			    c3 * pr2[k] + c4 * pr3[k] +
			    c5 * pr4[k] + c6 * pr5[k] +
			    c7 * pr6[k] + c8 * pr7[k];
		}
		for (; j < num_data_y; j++, pyp += 8) {
		    int k;
		    double c1, c2, c3, c4;
		    double c5, c6, c7, c8;
		    double *p = &pA[l_piv[j]][tmp_s];

		    c1 = pyp[0]; c2 = pyp[1];
		    c3 = pyp[2]; c4 = pyp[3];
		    c5 = pyp[4]; c6 = pyp[5];
		    c7 = pyp[6];
		    c8 = pyp[7] * inv_pivot;

		    for (k = 0; k <= num_x; k++)
			p[k] -=
			    c1 * pr0[k] + c2 * pr1[k] +
			    c3 * pr2[k] + c4 * pr3[k] +
			    c5 * pr4[k] + c6 * pr5[k] +
			    c7 * pr6[k] + c8 * pr7[k];
		}
#endif /* NO_TILE */

		MP_TIME(time_core += MPI_Wtime() - time_core1);

		save_j = j;

		if (j == num_data_y) {
		    get_piv(cid, i + 8, inv_piv);
		    change_piv(cid, i + 8, st_j);
		}

		new_st_j = st_j;

		for (l = 0; l < 8; l++) {
		    if (procid_y(piv[i + 8 + l], size) == cidy) {
			assert(tolocal_y(cidy, piv[i + 8 + l], size) == l_piv[new_st_j]);
			new_st_j++;
		    }
		}

		broad_piv(cid, size, num_data_x, pA, i + 8, new_st_j, piv_r);

		st_j = new_st_j;
	    }
	}
    }

    if (i < size - 8) {
	/* epilogue */
	int k;
	register double inv_pivot;
	double *pr0, *pr1, *pr2, *pr3;
	double *pr4, *pr5, *pr6, *pr7; 
	double *py;
	double a1, a2, a3;
	double a4, a5, a6, a7;
	int tmp_s, num_x;
	int active_f = (procid_x(i, size) == cidx);

	if (active_f) {
	    tmp_s = tolocal_x(cidx, i, size) + 8;
	}
	else {
	    tmp_s = g_to_l[i];
	}
	num_x = num_data_x - tmp_s;
	pr0 = piv_r[0]; pr1 = piv_r[1];
	pr2 = piv_r[2]; pr3 = piv_r[3];
	pr4 = piv_r[4]; pr5 = piv_r[5];
	pr6 = piv_r[6]; pr7 = piv_r[7];
	py = piv_r[8];
	a1 = inv_piv[0]; a2 = inv_piv[1];
	a3 = inv_piv[2]; a4 = inv_piv[3];
	a5 = inv_piv[4]; a6 = inv_piv[5];

	/*  update 2nd, 3rd & 4th pivot data  */
	if (pr0 == NULL) {
	    pr0 = get_pivot8_x(cid, i, 0);
	}
	if (pr1 == NULL) {
	    pr1 = get_pivot8_x(cid, i, 1);
	}
	if (pr2 == NULL) {
	    pr2 = get_pivot8_x(cid, i, 2);
	}
	if (pr3 == NULL) {
	    pr3 = get_pivot8_x(cid, i, 3);
	}
	for (k = 0; k <= num_x; k++) {
	    double p = pr0[k];
	    double q = pr1[k] - a1 * p;
	    double r = pr2[k] - a2 * p - a3 * q;
	    double s = pr3[k] - a4 * p - a5 * q - a6 * r;
	    pr1[k] = q;
	    pr2[k] = r;
	    pr3[k] = s;
	}

	a1 = inv_piv[6]; a2 = inv_piv[7];
	a3 = inv_piv[8]; a4 = inv_piv[9];

	/*  update 5th pivot data  */
	if (pr4 == NULL) {
	    pr4 = get_pivot8_x(cid, i, 4);
	}
	for (k = 0; k <= num_x; k++)
	    pr4[k] -=
		a1 * pr0[k] + a2 * pr1[k] +
		a3 * pr2[k] + a4 * pr3[k];

	a1 = inv_piv[10]; a2 = inv_piv[11];
	a3 = inv_piv[12]; a4 = inv_piv[13];
	a5 = inv_piv[14];

	/*  update 6th pivot data  */
	if (pr5 == NULL) {
	    pr5 = get_pivot8_x(cid, i, 5);
	}
	for (k = 0; k <= num_x; k++)
	    pr5[k] -=
		a1 * pr0[k] + a2 * pr1[k] +
		a3 * pr2[k] + a4 * pr3[k] +
		a5 * pr4[k];

	a1 = inv_piv[15]; a2 = inv_piv[16];
	a3 = inv_piv[17]; a4 = inv_piv[18];
	a5 = inv_piv[19]; a6 = inv_piv[20];

	/*  update 7th pivot data  */
	if (pr6 == NULL) {
	    pr6 = get_pivot8_x(cid, i, 6);
	}
	for (k = 0; k <= num_x; k++)
	    pr6[k] -=
		a1 * pr0[k] + a2 * pr1[k] +
		a3 * pr2[k] + a4 * pr3[k] +
		a5 * pr4[k] + a6 * pr5[k];

	a1 = inv_piv[21]; a2 = inv_piv[22];
	a3 = inv_piv[23]; a4 = inv_piv[24];
	a5 = inv_piv[25]; a6 = inv_piv[26];
	a7 = inv_piv[27];
	inv_pivot = inv_piv[28];

	/*  update 8th pivot data  */
	if (pr7 == NULL) {
	    pr7 = get_pivot8_x(cid, i, 7);
	}
	for (k = 0; k <= num_x; k++)
	    pr7[k] -=
		a1 * pr0[k] + a2 * pr1[k] +
		a3 * pr2[k] + a4 * pr3[k] +
		a5 * pr4[k] + a6 * pr5[k] +
		a7 * pr6[k];

	if (py == NULL) {
	    py = get_pivot8_y(cid, i);
	}

	MP_TIME(time_core1 = MPI_Wtime());

	if (active_f) {
	    int j;
	    for (j = st_j; j < num_data_y; j++) {
		int k;
		double c1, c2, c3, c4;
		double c5, c6, c7, c8;
		double *p = &pA[l_piv[j]][tmp_s];

		c1 = p[-8]; c2 = p[-7];
		c3 = p[-6]; c4 = p[-5];
		c5 = p[-4]; c6 = p[-3];
		c7 = p[-2];
		c8 = p[-1] * inv_pivot;
		for (k = 0; k <= num_x; k++)
		    p[k] -=
			c1 * pr0[k] + c2 * pr1[k] +
			c3 * pr2[k] + c4 * pr3[k] +
			c5 * pr4[k] + c6 * pr5[k] +
			c7 * pr6[k] + c8 * pr7[k];
	    }
	}
	else {
	    int j;
	    double *pyp = py;
	    for (j = st_j; j < num_data_y; j++, pyp += 8) {
		int k;
		double c1, c2, c3, c4;
		double c5, c6, c7, c8;
		double *p = &pA[l_piv[j]][tmp_s];

		c1 = pyp[0]; c2 = pyp[1];
		c3 = pyp[2]; c4 = pyp[3];
		c5 = pyp[4]; c6 = pyp[5];
		c7 = pyp[6];
		c8 = pyp[7] * inv_pivot;
		for (k = 0; k <= num_x; k++)
		    p[k] -=
			c1 * pr0[k] + c2 * pr1[k] +
			c3 * pr2[k] + c4 * pr3[k] +
			c5 * pr4[k] + c6 * pr5[k] +
			c7 * pr6[k] + c8 * pr7[k];
	    }
	}

	MP_TIME(time_core += MPI_Wtime() - time_core1);

	i += 8;
    }

#endif /* SOFTWARE_PIPE */

#else
    i = 0;
#endif

#ifdef NO_SOFTWARE_PIPE

    for (; i < size - 1; i++) {
	register double inv_pivot;
	double *pr0;
	double *py;
	int local_ix = tolocal_x(cidx, i, size);
	int tmp_s, num_x;
	int active_f = (procid_x(i, size) == cidx);

	if (active_f) {

	    MP_TIME(time_piv1 = MPI_Wtime());
	    p_pivot_y(cid, size, pA, i, st_j, NULL);
	    MP_TIME(time_piv += MPI_Wtime() - time_piv1);

	    broad_piv_i_1(cid, i);
	}
	else {
	    get_piv_1(cid, i, st_j, NULL);
	}
	if (procid_y(piv[i], size) == cidy) {
	    assert(tolocal_y(cidy, piv[i], size) == l_piv[st_j]);
	    st_j++;
	}

	broad_piv_1(cid, size, num_data_x, pA, i, st_j, piv_r);

	if (active_f) {
	    tmp_s = local_ix + 1;
	}
	else {
	    tmp_s = g_to_l[i];
	}
	num_x = num_data_x - tmp_s;

	pr0 = piv_r[0];
	py = piv_r[1];
	inv_pivot = inverse_piv[i];
	if (pr0 == NULL) {
	    pr0 = get_pivot1_x(cid, i);
	}
	if (py == NULL) {
	    py = get_pivot1_y(cid, i);
	}

	MP_TIME(time_core1 = MPI_Wtime());

	if (active_f) {
	    int j;
	    for (j = st_j; j < num_data_y; j++) {
		int k;
		double c1;
		double *p = &pA[l_piv[j]][tmp_s];

		c1 = p[-1] * inv_pivot;
		/* p[-1] = c1; // for LU decomposition */
		for (k = 0; k <= num_x; k++)
		    p[k] -= c1 * pr0[k];
	    }
	}
	else {
	    int j;
	    for (j = st_j; j < num_data_y; j++) {
		int k;
		double c1;
		double *p = &pA[l_piv[j]][tmp_s];

		c1 = py[j - st_j] * inv_pivot;
		for (k = 0; k <= num_x; k++)
		    p[k] -= c1 * pr0[k];
	    }
	}

	MP_TIME(time_core += MPI_Wtime() - time_core1);

	/** MPI_Barrier(MPI_COMM_WORLD); **/

    }

#else /* SOFTWARE_PIPE */

    if (i < size - 1) {
	int active_f = (procid_x(i, size) == cidx);
	if (active_f) {

	    MP_TIME(time_piv1 = MPI_Wtime());
	    p_pivot_y(cid, size, pA, i, st_j, NULL);
	    MP_TIME(time_piv += MPI_Wtime() - time_piv1);

	    broad_piv_i_1(cid, i);
	}
	else {
	    get_piv_1(cid, i, st_j, NULL);
	}
	if (procid_y(piv[i], size) == cidy) {
	    assert(tolocal_y(cidy, piv[i], size) == l_piv[st_j]);
	    st_j++;
	}
	broad_piv_1(cid, size, num_data_x, pA, i, st_j, piv_r);
    }

    for (; i < size - 2; i++) {
	register double inv_pivot;
	double *pr0;
	double *py;
	int local_ix = tolocal_x(cidx, i, size);
	int tmp_s, num_x;
	int active_f = (procid_x(i, size) == cidx);
	int next_active_f = (procid_x(i + 1, size) == cidx);

	if (active_f) {
	    tmp_s = local_ix + 1;
	}
	else {
	    tmp_s = g_to_l[i];
	}
	num_x = num_data_x - tmp_s;

	pr0 = piv_r[0];
	py = piv_r[1];
	inv_pivot = inverse_piv[i];
	if (pr0 == NULL) {
	    pr0 = get_pivot1_x(cid, i);
	}
	if (py == NULL) {
	    py = get_pivot1_y(cid, i);
	}
	if (next_active_f) {
	    if (active_f) {
		int j;
		int piv_i;
		for (j = st_j; j < num_data_y; j++) {
		    double c1;
		    double *p = &pA[l_piv[j]][tmp_s];

		    c1 = p[-1] * inv_pivot;
		    /* p[-1] = c1; // for LU decomposition */
		    p[0] -= c1 * pr0[0];
		}

		MP_TIME(time_piv1 = MPI_Wtime());
		p_pivot_y(cid, size, pA, i + 1, st_j, py);
		MP_TIME(time_piv += MPI_Wtime() - time_piv1);

		broad_piv_i_1(cid, i + 1);
		piv_i = piv[i + 1];

		if (procid_y(piv_i, size) == cidy) {
		    int k;
		    double c1;
		    int local_i = tolocal_y(cidy, piv_i, size);
		    double *p = &pA[local_i][tmp_s];

		    assert(py[0] == p[-1]);
		    c1 = p[-1] * inv_pivot;
		    /* p[-1] = c1; // for LU decomposition */
		    for (k = 1; k <= num_x; k++)
			p[k] -= c1 * pr0[k];

		    assert(local_i == l_piv[st_j]);
		    st_j++;	/* st_j is incremented here */
		}
		broad_piv_1(cid, size, num_data_x, pA, i + 1, st_j, piv_r);

		MP_TIME(time_core1 = MPI_Wtime());

		for (j = st_j; j < num_data_y; j++) {
		    int k;
		    double c1;
		    double *p = &pA[l_piv[j]][tmp_s];

		    c1 = p[-1] * inv_pivot;
		    /* p[-1] = c1; // for LU decomposition */
		    for (k = 1; k <= num_x; k++)
			p[k] -= c1 * pr0[k];
		}

		MP_TIME(time_core += MPI_Wtime() - time_core1);

	    }
	    else {
		int j;
		int piv_i;

		for (j = st_j; j < num_data_y; j++) {
		    double c1;
		    double *p = &pA[l_piv[j]][tmp_s];

		    c1 = py[j - st_j] * inv_pivot;
		    p[0] -= c1 * pr0[0];
		}

		MP_TIME(time_piv1 = MPI_Wtime());
		p_pivot_y(cid, size, pA, i + 1, st_j, py);
		MP_TIME(time_piv += MPI_Wtime() - time_piv1);

		broad_piv_i_1(cid, i + 1);
		piv_i = piv[i + 1];

		j = st_j;
		if (procid_y(piv_i, size) == cidy) {
		    int k;
		    double c1;
		    int local_i = tolocal_y(cidy, piv_i, size);
		    double *p = &pA[local_i][tmp_s];

		    c1 = py[0] * inv_pivot;
		    for (k = 1; k <= num_x; k++)
			p[k] -= c1 * pr0[k];

		    j++;
		}
		broad_piv_1(cid, size, num_data_x, pA, i + 1, j, piv_r);

		MP_TIME(time_core1 = MPI_Wtime());

		for (; j < num_data_y; j++) {
		    int k;
		    double c1;
		    double *p = &pA[l_piv[j]][tmp_s];

		    c1 = py[j - st_j] * inv_pivot;
		    for (k = 1; k <= num_x; k++)
			p[k] -= c1 * pr0[k];
		}

		MP_TIME(time_core += MPI_Wtime() - time_core1);

		if (procid_y(piv_i, size) == cidy) {
		    st_j++;	/* st_j is incremented here */
		}
	    }
	}
	else {

	    MP_TIME(time_core1 = MPI_Wtime());

	    if (active_f) {
		int j;
		for (j = st_j; j < num_data_y; j++) {
		    int k;
		    double c1;
		    double *p = &pA[l_piv[j]][tmp_s];

		    c1 = p[-1] * inv_pivot;
		    /* p[-1] = c1; // for LU decomposition */
		    for (k = 0; k <= num_x; k++)
			p[k] -= c1 * pr0[k];
		}
	    }
	    else {
		int j;
		for (j = st_j; j < num_data_y; j++) {
		    int k;
		    double c1;
		    double *p = &pA[l_piv[j]][tmp_s];

		    c1 = py[j - st_j] * inv_pivot;
		    for (k = 0; k <= num_x; k++)
			p[k] -= c1 * pr0[k];
		}
	    }

	    MP_TIME(time_core += MPI_Wtime() - time_core1);

	    get_piv_1(cid, i + 1, st_j, py);
	    if (procid_y(piv[i + 1], size) == cidy) {
		assert(tolocal_y(cidy, piv[i + 1], size) == l_piv[st_j]);
		st_j++;
	    }
	    broad_piv_1(cid, size, num_data_x, pA, i + 1, st_j, piv_r);
	}
    }

    if (i < size - 1) {
	register double inv_pivot;
	double *pr0;
	double *py;
	int local_ix = tolocal_x(cidx, i, size);
	int tmp_s, num_x;
	int active_f = (procid_x(i, size) == cidx);

	if (active_f) {
	    tmp_s = local_ix + 1;
	}
	else {
	    tmp_s = g_to_l[i];
	}
	num_x = num_data_x - tmp_s;

	pr0 = piv_r[0];
	py = piv_r[1];
	inv_pivot = inverse_piv[i];
	if (pr0 == NULL) {
	    pr0 = get_pivot1_x(cid, i);
	}
	if (py == NULL) {
	    py = get_pivot1_y(cid, i);
	}

	MP_TIME(time_core1 = MPI_Wtime());

	if (active_f) {
	    int j;
	    for (j = st_j; j < num_data_y; j++) {
		int k;
		double c1;
		double *p = &pA[l_piv[j]][tmp_s];

		c1 = p[-1] * inv_pivot;
		/* p[-1] = c1; // for LU decomposition */
		for (k = 0; k <= num_x; k++)
		    p[k] -= c1 * pr0[k];
	    }
	}
	else {
	    int j;
	    for (j = st_j; j < num_data_y; j++) {
		int k;
		double c1;
		double *p = &pA[l_piv[j]][tmp_s];

		c1 = py[j - st_j] * inv_pivot;
		for (k = 0; k <= num_x; k++)
		    p[k] -= c1 * pr0[k];
	    }
	}

	MP_TIME(time_core += MPI_Wtime() - time_core1);

	i++;
    }

#endif /* SOFTWARE_PIPE */

    if (st_j < num_data_y) {
	int piv_j = l_piv[st_j];
	int global_j = toglobal_y(cidy, piv_j, size);
	int local_ix = tolocal_x(cidx, i, size);

	piv[i] = global_j;
	broad_send_int_y(cid, BROAD_PIV_LAST, &piv[i]);

	if (procid_x(i, size) == cidx) {
	    assert(pA[piv_j][local_ix] != 0.0);
	    inverse_piv[i] = 1 / pA[piv_j][local_ix];
	    broad_send_double(cid, BROAD_INV_PIV, &inverse_piv[i]);
	}
	else {
	    broad_recv_double(cid, BROAD_INV_PIV, &inverse_piv[i]);
	}
    }
    else {
	broad_recv_int_y(cid, BROAD_PIV_LAST, &piv[i]);
	broad_recv_double(cid, BROAD_INV_PIV, &inverse_piv[i]);
    }
}

/*
 *  Set up the tables used by the elimination and the triangular solvers:
 *    l_piv[]    starts as the identity permutation over the local rows,
 *    g_to_l[g]  is the number of columns with global index < g held by
 *               this process column,
 *    g_to_l_e[g] is the same count taken over global indices <= g, minus
 *               one (so -1 while no such column has been seen yet).
 */

void
init_LU(cid, size)
    int cid, size;
{
    int row, col;
    int owned = 0;	/* locally held columns seen so far */
    int cidx, cidy;
    int rows_here;

    lin_trec(cid, cidx, cidy);
    rows_here = num_in_proc_y(cidy, size);

    for (row = 0; row < rows_here; row++) {
	l_piv[row] = row;
    }

    for (col = 0; col < size; col++) {
	/* count before looking at column `col' itself ... */
	g_to_l[col] = owned;
	if (procid_x(col, size) == cidx) {
	    owned++;
	}
	/* ... and (count - 1) after it has been taken into account */
	g_to_l_e[col] = owned - 1;
    }
}

/*
 *  Solve an upper triangular matrix eliminated by 'gauss()'
 *
 *  Back substitution that walks the unknowns from x[size - 1] down to
 *  x[0].  For each unknown i, only the processes whose y coordinate owns
 *  the pivot row piv[i] take part: they accumulate a partial dot product
 *  over their local columns, combine it with dsum_x(), compute x[i] and
 *  publish it (broad_send_double_y() or y_broad_put(), depending on the
 *  build).  Everybody else picks the values up later, either when one of
 *  its own rows needs them or in the final drain after the main loop.
 *
 *    cid   id of the calling process; lin_trec() derives cidx/cidy from
 *          it (presumably the coordinates in the process grid).
 *    size  order of the system.
 *    pA    local part of the eliminated matrix; local column size_x of a
 *          row is read as its right-hand side entry.
 *    x     output vector of length size, indexed by global column.
 *
 *  Relies on piv[], inverse_piv[] and g_to_l_e[] having been filled in
 *  beforehand (the latter by init_LU()).
 */

void
solve_U(cid, size, pA, x)
    int cid;
    int size;
    dot_mat pA;
    double *x;
{
    int i, j;
    int cidx, cidy;
    int size_x;
    /* highest index of x[] this process has not yet waited for */
    int num_recv = size - 1;

    lin_trec(cid, cidx, cidy);
    size_x = num_in_proc_x(cidx, size);

    for (i = size - 1; i >= 0; i--) {
	int piv_i = piv[i];
	if (procid_y(piv_i, size) == cidy) {
	    int local_i = tolocal_y(cidy, piv_i, size);
	    double s = 0;	/* local partial sum of U[i][j] * x[j] */
	    double t;		/* result handed back by dsum_x() */
	    int in;		/* only used by the __BROAD_PUT__ variant */

	    /*
	     * First use the local columns whose x[] value is already here.
	     * g_to_l_e[k] is the local index of the last locally held
	     * column with global index <= k (see init_LU()), so j stops
	     * just above the columns that still depend on pending values.
	     */
	    for (j = size_x - 1; j > g_to_l_e[num_recv]; j--) {
		int global_j = toglobal_x(cidx, j, size);
		s += pA[local_i][j] * x[global_j];
	    }


#if defined(__AP__) && defined(__BROAD_PUT__)
	    /*
	     * Advance the expected-arrival counter local_f and call
	     * amcheck() on put_f (presumably blocks until that many puts
	     * have landed -- TODO confirm).  `in' is one less than the
	     * global index of the local column following g_to_l_e[i].
	     */
	    in = toglobal_x(cidx, g_to_l_e[i] + 1, size) - 1;
	    if (num_recv - in > 0) {
		local_f += num_recv - in;
		amcheck(&put_f, local_f);
		num_recv = in;
	    }
#else /* __MPI__ */
	    /* receive every outstanding x[k], k = num_recv .. i + 1 */
	    for (; num_recv > i; num_recv--) {
		broad_recv_double_y(cid, BROAD_ANS_B + num_recv * SKIP,
				    &x[num_recv]);
	    }
#endif

	    /* finish the partial sum with the columns that were pending */
	    for (; j > g_to_l_e[i]; j--) {
		int global_j = toglobal_x(cidx, j, size);
		s += pA[local_i][j] * x[global_j];
	    }
	    /* combine the partial sums (presumably across the x direction) */
	    dsum_x(cid, s, &t);
	    /* inverse_piv[i] is used as the reciprocal of the pivot */
	    x[i] = (pA[local_i][size_x] - t) * inverse_piv[i];
#if defined(__AP__) && defined(__BROAD_PUT__)
	    y_broad_put(&x[i], 2, &x[i], &put_f, &dma_f);
#else /* __MPI__ */
	    broad_send_double_y(cid, BROAD_ANS_B + i * SKIP, &x[i]);
#endif
	    /* x[i] was produced here, so it must not be waited for again */
	    num_recv--;
	}
    }

    /* receive the rest message */
#if defined(__AP__) && defined(__BROAD_PUT__)
    local_f += num_recv + 1;
    amcheck(&put_f, local_f);
#else /* __MPI__ */
    for (; num_recv >= 0; num_recv--) {
	broad_recv_double_y(cid, BROAD_ANS_B + num_recv * SKIP, &x[num_recv]);
    }
#endif
}


void
solve_U_block(cid, size, pA, x)
    int cid;
    int size;
    mat pA;
    double *x;
{
    int i, j;
    int num_recv = size - 1;

    for (i = size - 1; i >= 0; i--) {
	int piv_i = piv[i];
	if (procid_block(piv_i, size) == cid) {
	    int local_i = tolocal_block(cid, piv_i, size);
	    double s = 0;
	    for (j = size - 1; j > num_recv; j--) {
		s += pA[local_i][j] * x[j];
	    }
	    for (; num_recv > i; num_recv--) {
		broad_recv_double(cid, BROAD_ANS_B + num_recv * SKIP, &x[num_recv]);
	    }

	    for (; j > i; j--) {
		s += pA[local_i][j] * x[j];
	    }
	    x[i] = (pA[local_i][size] - s) * inverse_piv[i];
	    broad_send_double(cid, BROAD_ANS_B + i * SKIP, &x[i]);
	    num_recv--;
	}
    }

    /* receive the rest message */
    for (; num_recv >= 0; num_recv--) {
	broad_recv_double(cid, BROAD_ANS_B + num_recv * SKIP, &x[num_recv]);
    }
}


/*
 *  Solve Upper Triangular Matrix.
 *
 *  Using Outer Product
 *
 *  x[] is cleared, then the unknowns are visited from size - 1 down to 0.
 *  Processes whose y coordinate owns pivot row piv[i] combine x[i] with
 *  dsum_x(), compute the unknown and publish it; the others obtain it
 *  (broad_recv_double_y(), or amcheck() in the __BROAD_PUT__ build).
 *  Both sides then run an update loop over local columns.
 *
 *    cid   id of the calling process (lin_trec() derives cidx/cidy).
 *    size  order of the system.
 *    pA    local part of the eliminated matrix; local column size_x is
 *          read as the right-hand side.
 *    x     output vector of length size.
 *
 *  NOTE(review): this routine looks unfinished and should be checked
 *  against solve_U() before it is relied upon:
 *    - local_i comes from tolocal_x(cidy, i, size), whereas solve_U()
 *      locates the pivot row with tolocal_y(cidy, piv_i, size);
 *    - the update statements read x[piv_j] on both sides, so the freshly
 *      obtained x[i] never enters the accumulation;
 *    - the two update loops use different upper bounds
 *      (g_to_l[i] - 1 versus g_to_l[i]);
 *    - pA[local_i] is indexed even on processes that do not own the row.
 *  These statements are deliberately left as they were.
 */

void
solve_U_outer(cid, size, pA, x)
    int cid;
    int size;
    dot_mat pA;
    double *x;
{
    int i, j;
    int cidx, cidy;
    int size_x;

    lin_trec(cid, cidx, cidy);
    size_x = num_in_proc_x(cidx, size);

    /*
     * Clear the accumulators with a plain loop rather than the legacy
     * bzero(): no header among the visible includes declares it, and
     * storing 0.0 does not depend on the bit pattern of a zero double.
     */
    for (j = 0; j < size; j++) {
	x[j] = 0.0;
    }

    for (i = size - 1; i >= 0; i--) {
	int piv_i = piv[i];
	/* NOTE(review): x-direction conversion applied to cidy -- confirm */
	int local_i = tolocal_x(cidy, i, size);

	if (procid_y(piv_i, size) == cidy) {
	    double t;
	    /* combine the accumulated x[i] (presumably across x direction) */
	    dsum_x(cid, x[i], &t);
	    x[i] = (pA[local_i][size_x] - t) * inverse_piv[i];
#if defined(__AP__) && defined(__BROAD_PUT__)
	    y_broad_put(&x[i], 2, &x[i], &put_f, &dma_f);
#else /* __MPI__ */
	    broad_send_double_y(cid, BROAD_ANS_B + i * SKIP, &x[i]);
#endif

	    /* NOTE(review): bound is g_to_l[i] - 1 here, g_to_l[i] below */
	    for (j = 0; j < g_to_l[i] - 1; ++j) {
		int global_j = toglobal_x(cidx, j, size);
		int piv_j = piv[global_j];
		int l_piv_j = l_piv[j];
		/* NOTE(review): scales x[piv_j] by itself; x[i] expected? */
		x[piv_j] += pA[local_i][l_piv_j] * x[piv_j];
	    }
	}
	else {
#if defined(__AP__) && defined(__BROAD_PUT__)
	    /* one more put is expected before x[i] can be used */
	    ++local_f;
	    amcheck(&put_f, local_f);
#else /* __MPI__ */
	    broad_recv_double_y(cid, BROAD_ANS_B + i * SKIP,
				&x[i]);
#endif
	    for (j = 0; j < g_to_l[i]; ++j) {
		int global_j = toglobal_x(cidx, j, size);
		int piv_j = piv[global_j];
		int l_piv_j = l_piv[j];
		/* NOTE(review): same self-referential update as above */
		x[piv_j] += pA[local_i][l_piv_j] * x[piv_j];
	    }
	}
    }
}
