#include #include #include #include #include "npb-C.h" #define MAX_TIMERS 2 #include "upc_timers.h" #include "globals.h" #define TIMER_BENCH 0 #define TIMER_INIT 1 /* global variables */ shared double timer[MAX_TIMERS][THREADS]; shared int jg[4][MM][2]; shared double red_best, red_winner; shared double sh_a[4], sh_c[4]; shared int sh_nit, sh_nx, sh_ny, sh_nz, sh_debug_vec[8], sh_lt, sh_lb; shared double s, max; int is1, is2, is3, ie1, ie2, ie3; upc_lock_t *critical_lock; void allocate_memory(int n2, int n3); void setup(int *n1, int *n2, int *n3, int lt); double power( double a, int n ); void bubble( double ten[M][2], int j1[M][2], int j2[M][2], int j3[M][2], int m, int ind ); int nx[MAXLEVEL+1], ny[MAXLEVEL+1], nz[MAXLEVEL+1]; char Class; int debug_vec[8]; int lt, lb; int ir[MAXLEVEL]; int m1[MAXLEVEL+1], m2[MAXLEVEL+1], m3[MAXLEVEL+1]; int nbr[4][2][MAXLEVEL+1]; int dead[MAXLEVEL+1], give_ex[4][MAXLEVEL+1], take_ex[4][MAXLEVEL+1]; shared sh_arr_t sh_u[THREADS*(LT_DEFAULT_I)], sh_r[THREADS*(LT_DEFAULT_I)]; shared sh_arr_t sh_v[THREADS]; void zero3( shared sh_arr_t *z, int n1, int n2, int n3); void zran3( shared sh_arr_t *z, int n1, int n2, int n3, int nx, int ny, int k); void comm3( shared sh_arr_t *u, int n1, int n2, int n3, int kk); void norm2u3(shared sh_arr_t *r, int n1, int n2, int n3, double *rnm2, double *rnmu, int nx, int ny, int nz); void resid( shared sh_arr_t *u, shared sh_arr_t *v, shared sh_arr_t *r, int n1, int n2, int n3, double a[4], int k ); void rep_nrm( shared sh_arr_t *u, int n1, int n2, int n3, const char *title, int kk); void showall( shared sh_arr_t *z, int n1, int n2, int n3); void mg3P(shared sh_arr_t *u, shared sh_arr_t *v, shared sh_arr_t *r, double a[4], double c[4], int n1, int n2, int n3, int k); void rprj3( shared sh_arr_t *r, int m1k, int m2k, int m3k, shared sh_arr_t *s, int m1j, int m2j, int m3j, int k ); void psinv( shared sh_arr_t *r, shared sh_arr_t *u, int n1, int n2, int n3, double c[4], int k); void interp( shared sh_arr_t *z, int mm1, int mm2, int mm3, shared sh_arr_t *u, int n1, int n2, int n3, int k ); #include "file_output.h" #define FILE_DEBUG FALSE int main(int argc, char *argv[]){ /* k is the current level. It is passed down through subroutine args and is NOT global. it is the current iteration */ int it; double t, tinit, mflops; double a[4], c[4]; double rnm2; double rnmu; /* These arrays are in common because they are quite large and probably shouldn't be allocated on the stack. They are always passed as subroutine args. */ double epsilon = 1.0e-8; int n1, n2, n3, nit; double verify_value; boolean verified; int i, l; int nn; FILE *fp; double max_timer[MAX_TIMERS]; // UPC Specific initializations critical_lock = upc_all_lock_alloc(); MEM_OK( critical_lock ); // Parameters timer_clear(TIMER_BENCH); timer_clear(TIMER_INIT); timer_start(TIMER_INIT); if( MYTHREAD == 0 ) { /* Read in and broadcast input data */ printf("\n\n NAS Parallel Benchmarks 2.4 UPC version" " - MG Benchmark - GWU/HPCL\n\n"); fp = fopen("mg.input", "r"); if (fp != NULL) { printf(" Reading from input file mg.input\n"); fscanf(fp, "%d", <); while(fgetc(fp) != '\n'); fscanf(fp, "%d%d%d", &nx[lt], &ny[lt], &nz[lt]); while(fgetc(fp) != '\n'); fscanf(fp, "%d", &nit); while(fgetc(fp) != '\n'); for (i = 0; i <= 7; i++) { fscanf(fp, "%d", &debug_vec[i]); } fclose(fp); } else { printf(" No input file. Using compiled defaults\n"); lt = LT_DEFAULT; nit = NIT_DEFAULT; nx[lt] = NX_DEFAULT; ny[lt] = NY_DEFAULT; nz[lt] = NZ_DEFAULT; for (i = 0; i <= 7; i++) debug_vec[i] = DEBUG_DEFAULT; } sh_nx=nx[lt]; sh_ny=ny[lt]; sh_nz=nz[lt]; sh_lt=lt; sh_nit=nit; for (i = 0; i <= 7; i++) sh_debug_vec[i]=debug_vec[i]; if ( (nx[lt] != ny[lt]) || (nx[lt] != nz[lt]) ) Class = 'U'; else if( nx[lt] == 32 && nit == 4 ) Class = 'S'; else if( nx[lt] == 64 && nit == 40 ) Class = 'W'; else if( nx[lt] == 256 && nit == 4 ) Class = 'A'; else if( nx[lt] == 256 && nit == 20 ) Class = 'B'; else if( nx[lt] == 512 && nit == 20 ) Class = 'C'; else if( nx[lt] == 1024 && nit == 50 ) Class = 'D'; else Class = 'U'; /* Use these for debug info: debug_vec(0) = 1 !=> report all norms debug_vec(1) = 1 !=> some setup information debug_vec(1) = 2 !=> more setup information debug_vec(2) = k => at level k or below, show result of resid debug_vec(3) = k => at level k or below, show result of psinv debug_vec(4) = k => at level k or below, show result of rprj debug_vec(5) = k => at level k or below, show result of interp debug_vec(6) = 1 => (unused) debug_vec(7) = 1 => (unused) */ sh_a[0] = -8.0/3.0; sh_a[1] = 0.0; sh_a[2] = 1.0/6.0; sh_a[3] = 1.0/12.0; if (Class == 'A' || Class == 'S' || Class =='W') { /* Coefficients for the S(a) smoother */ sh_c[0] = -3.0/8.0; sh_c[1] = 1.0/32.0; sh_c[2] = -1.0/64.0; sh_c[3] = 0.0; } else { /* Coefficients for the S(b) smoother */ sh_c[0] = -3.0/17.0; sh_c[1] = 1.0/33.0; sh_c[2] = -1.0/61.0; sh_c[3] = 0.0; } sh_lb = 1; } upc_barrier 1; lt=sh_lt; nit=sh_nit; lb=sh_lb; nx[lt]=sh_nx;ny[lt]=sh_ny;nz[lt]=sh_nz; for (i = 0; i <= 7; i++) debug_vec[i]=sh_debug_vec[i]; a[0]=sh_a[0]; a[1]=sh_a[1]; a[2]=sh_a[2]; a[3]=sh_a[3]; c[0]=sh_c[0]; c[1]=sh_c[1]; c[2]=sh_c[2]; c[3]=sh_c[3]; setup(&n1,&n2,&n3,lt); for( l=lt; l>=1; l-- ){ sh_u[MYTHREAD+(l*THREADS)].arr = (shared [] double *) upc_alloc((m3[l]*m2[l]*m1[l]) * sizeof(double)); MEM_OK( sh_u[MYTHREAD+(l*THREADS)].arr ); sh_r[MYTHREAD+(l*THREADS)].arr = (shared [] double *) upc_alloc((m3[l]*m2[l]*m1[l]) * sizeof(double)); MEM_OK( sh_r[MYTHREAD+(l*THREADS)].arr ); } sh_v[MYTHREAD].arr = (shared [] double *) upc_alloc((m3[lt]*m2[lt]*m1[lt]) * sizeof(double)); MEM_OK( sh_v[MYTHREAD].arr ); allocate_memory(n2, n3); zero3( &sh_u[lt*THREADS], n1, n2, n3); zran3( sh_v, n1, n2, n3, nx[lt], ny[lt], lt); norm2u3( sh_v, n1, n2, n3, &rnm2, &rnmu, nx[lt], ny[lt], nz[lt]); if (MYTHREAD == 0) { printf(" Size: %3dx%3dx%3d (class %1c)\n", nx[lt], ny[lt], nz[lt], Class); printf(" Iterations: %3d\n", nit); } resid( &sh_u[lt*THREADS], sh_v, &sh_r[lt*THREADS], n1, n2, n3, a, lt); norm2u3( &sh_r[lt*THREADS], n1, n2, n3, &rnm2, &rnmu, nx[lt], ny[lt], nz[lt]); /* One iteration for startup */ mg3P( sh_u, sh_v, sh_r, a, c, n1, n2, n3, lt); resid( &sh_u[lt*THREADS], sh_v, &sh_r[lt*THREADS], n1, n2, n3, a, lt); setup(&n1,&n2,&n3,lt); zero3( &sh_u[lt*THREADS], n1, n2, n3 ); zran3( sh_v, n1, n2, n3, nx[lt], ny[lt], lt ); upc_barrier 17; timer_stop(TIMER_INIT); timer_start(TIMER_BENCH); resid( &sh_u[lt*THREADS], sh_v, &sh_r[lt*THREADS], n1, n2, n3, a, lt); norm2u3( &sh_r[lt*THREADS], n1, n2, n3, &rnm2, &rnmu, nx[lt], ny[lt], nz[lt] ); for ( it = 1; it <= nit; it++){ if( MYTHREAD == 0 ) printf(" %4d\n", it ); mg3P( sh_u, sh_v, sh_r, a, c, n1, n2, n3, lt ); resid( &sh_u[lt*THREADS], sh_v, &sh_r[lt*THREADS], n1, n2, n3, a, lt ); } norm2u3( &sh_r[lt*THREADS], n1, n2, n3, &rnm2, &rnmu, nx[lt], ny[lt], nz[lt] ); timer_stop(TIMER_BENCH); /* prepare for timers_reduce() */ for( i=0; i0; k--) ng[ax][k] = ng[ax][k+1]/2; } for(k=lt-1; k>0; k--) { nx[k] = ng[1][k]; ny[k] = ng[2][k]; nz[k] = ng[3][k]; } log_p = log(((float)THREADS)+0.0001)/log(2.0); dx = log_p/3; pi[1] = power(2,dx); idi[1] = MYTHREAD % pi[1]; dy = (log_p-dx)/2; pi[2] = power(2,dy); idi[2] = (MYTHREAD/pi[1]) % pi[2]; pi[3] = THREADS/(pi[1]*pi[2]); idi[3] = MYTHREAD / (pi[1]*pi[2]); for(k=lt; k>0; k--) { dead[k] = 0; for(ax=1; ax<4; ax++) { take_ex[ax][k] = 0; give_ex[ax][k] = 0; mi[ax][k] = 2 + ((idi[ax]+1)*ng[ax][k])/pi[ax] - ((idi[ax]+0)*ng[ax][k])/pi[ax]; mip[ax][k] = 2 + ((next[ax]+idi[ax]+1)*ng[ax][k])/pi[ax] - ((next[ax]+idi[ax]+0)*ng[ax][k])/pi[ax]; if (mip[ax][k]==2 || mi[ax][k]==2) next[ax] = 2*next[ax]; if( k+1 < lt ) { if ((mip[ax][k]==2) && (mi[ax][k]==3)) { give_ex[ax][k+1] = 1; } if ((mip[ax][k]==3)&&(mi[ax][k]==2)) { take_ex[ax][k+1] = 1; } } } if( mi[1][k]==2 || mi[2][k]==2 || mi[3][k]==2 ) dead[k] = 1; m1[k] = mi[1][k]; m2[k] = mi[2][k]; m3[k] = mi[3][k]; for (ax=1; ax<4; ax++) { idin[ax][DIRm1] = (idi[ax]+next[ax]+pi[ax]) % pi[ax]; idin[ax][DIRp1] = (idi[ax]-next[ax]+pi[ax]) % pi[ax]; } for (dir=0; dir<2; dir++) { nbr[1][dir][k] = idin[1][dir] + pi[1] * (idi[2] + pi[2] * idi[3]); nbr[2][dir][k] = idi[1] + pi[1] * (idin[2][dir] + pi[2] * idi[3]); nbr[3][dir][k] = idi[1] + pi[1] * (idi[2] + pi[2] * idin[3][dir]); } } k=lt; is1 = 1 + ng[1][k] - ((pi[1] -idi[1])*ng[1][lt])/pi[1]; ie1 = 0 + ng[1][k] - ((pi[1]-1-idi[1])*ng[1][lt])/pi[1]; *n1 = 3 + ie1 - is1; is2 = 1 + ng[2][k] - ((pi[2] -idi[2])*ng[2][lt])/pi[2]; ie2 = 0 + ng[2][k] - ((pi[2]-1-idi[2])*ng[2][lt])/pi[2]; *n2 = 3 + ie2 - is2; is3 = 1 + ng[3][k] - ((pi[3] -idi[3])*ng[3][lt])/pi[3]; ie3 = 0 + ng[3][k] - ((pi[3]-1-idi[3])*ng[3][lt])/pi[3]; *n3 = 3 + ie3 - is3; ir[lt]=1; for(j = lt-1 ; j>0; j--) ir[j]=ir[j+1] + m1[j+1]*m2[j+1]*m3[j+1]; if (debug_vec[1] >= 1 ) { if (MYTHREAD == 0){ printf(" in setup, \n"); printf(" P# lt nx ny nz n1 n2 n3 is1 is2 is3 ie1 ie2 ie3\n"); } upc_barrier 67; printf("%4d%4d%4d%4d%4d%4d%4d%4d%4d%4d%4d%4d%4d%4d\n", MYTHREAD,lt,nx[lt],ny[lt],nz[lt],*n1,*n2,*n3,is1,is2,is3,ie1,ie2,ie3); } if (debug_vec[1] >= 2 ) { for(i=0; i0; k--) { printf("%didi=%4d%4d%4d", k,idi[1], idi[2], idi[3]); printf("nbr=%4d%4d %4d%4d %4d%4d ", nbr[1][DIRm1][k], nbr[1][DIRp1][k], nbr[2][DIRm1][k], nbr[2][DIRp1][k], nbr[3][DIRm1][k], nbr[3][DIRp1][k]); printf("mi=%4d%4d%4d\n", mi[1][k],mi[2][k],mi[3][k]); } printf("idi(s) = %10d%10d%10d\n", idi[1],idi[2],idi[3]); printf("dead(2), dead(1) = %2d%2d\n", dead[2], dead[1]); for(ax=1; ax<4; ax++) { printf("give_ex[%d][2]= %d", ax, give_ex[ax][2]); printf(" take_ex[%d][2]= %d\n", ax, take_ex[ax][2]); } } upc_barrier 68; } } k = lt; } void mg3P(shared sh_arr_t *u, shared sh_arr_t *v, shared sh_arr_t *r, double a[4], double c[4], int n1, int n2, int n3, int k){ /* multigrid V-cycle routine */ int j; /* down cycle. restrict the residual from the find grid to the coarse */ for (k = lt; k >= lb+1; k--){ j = k-1; rprj3( &r[k*THREADS], m1[k], m2[k], m3[k], &r[j*THREADS], m1[j], m2[j], m3[j], k); } k = lb; /* compute an approximate solution on the coarsest grid */ zero3( &u[k*THREADS], m1[k], m2[k], m3[k]); psinv( &r[k*THREADS], &u[k*THREADS], m1[k], m2[k], m3[k], c, k); for (k = lb+1; k <= lt-1; k++){ j = k-1; /* prolongate from level k-1 to k */ zero3( &u[k*THREADS], m1[k], m2[k], m3[k]); interp( &u[j*THREADS], m1[j], m2[j], m3[j], &u[k*THREADS], m1[k], m2[k], m3[k], k); /* compute residual for level k */ resid( &u[k*THREADS], &r[k*THREADS], &r[k*THREADS], m1[k], m2[k], m3[k], a, k); /* apply smoother */ psinv( &r[k*THREADS], &u[k*THREADS], m1[k], m2[k], m3[k], c, k); } j = lt - 1; k = lt; interp( &u[j*THREADS], m1[j], m2[j], m3[j], &u[lt*THREADS], n1, n2, n3, k); resid( &u[lt*THREADS], v, &r[lt*THREADS], n1, n2, n3, a, k); psinv( &r[lt*THREADS], &u[lt*THREADS], n1, n2, n3, c, k); } double power( double a, int n ){ /* power raises an integer, disguised as a double precision real, to an integer power */ double aj; int nj; double rdummy; double power; power = 1.0; nj = n; aj = a; while (nj != 0){ if( (nj%2) == 1 ) rdummy = randlc( &power, aj ); rdummy = randlc( &aj, aj ); nj = nj/2; } return power; } void bubble( double ten[M][2], int j1[M][2], int j2[M][2], int j3[M][2], int m, int ind ){ /* bubble does a bubble sort in direction dir */ double temp; int i, j_temp; if ( ind == 1 ) { for (i = 0; i < m-1; i++) { if ( ten[i][ind] > ten[i+1][ind] ) { temp = ten[i+1][ind]; ten[i+1][ind] = ten[i][ind]; ten[i][ind] = temp; j_temp = j1[i+1][ind]; j1[i+1][ind] = j1[i][ind]; j1[i][ind] = j_temp; j_temp = j2[i+1][ind]; j2[i+1][ind] = j2[i][ind]; j2[i][ind] = j_temp; j_temp = j3[i+1][ind]; j3[i+1][ind] = j3[i][ind]; j3[i][ind] = j_temp; } else return; } } else { for (i = 0; i < m-1; i++) { if ( ten[i][ind] < ten[i+1][ind] ) { temp = ten[i+1][ind]; ten[i+1][ind] = ten[i][ind]; ten[i][ind] = temp; j_temp = j1[i+1][ind]; j1[i+1][ind] = j1[i][ind]; j1[i][ind] = j_temp; j_temp = j2[i+1][ind]; j2[i+1][ind] = j2[i][ind]; j2[i][ind] = j_temp; j_temp = j3[i+1][ind]; j3[i+1][ind] = j3[i][ind]; j3[i][ind] = j_temp; } else return; } } }