/*-------------------------------------------------------------------- NAS Parallel Benchmarks 2.3 UPC versions - CG This version of UPC CG was developed by the high performance computing group in the Computer Science Department in the University of North Carolina. It was derivated from the Fortran versions in "NPB 2.3-serial" developed by NAS and the Openmp ersion developed by RWCP. We hereby grant the permission to use, copy, distribute and modify this software for any purpose with/without fee. This software is provided in "as is" fashion. No explicit or implied warranty may be derived. Please send comments on the UPC versions to prins@cs.unc.edu For RWCP OpenMp implementation, refer to website at: http://pdplab.trc.rwcp.or.jp/pdperf/Omni/ NASA's NPB benchmark is on line at: http://www.nas.nasa.gov/NAS/NPB/ --------------------------------------------------------------------*/ /*-------------------------------------------------------------------- Authors: Luke Huan Jeff Feasel Jan Prins -------------------------------------------------------------------*/ #include "upc_relaxed.h" #include "upc.h" #include "npb-C.h" #include "npbparams.h" #define NZ NA*(NONZER+1)*(NONZER+1)/THREADS+NA*(NONZER+2) #define BLOCK_V 1000 #define BLOCK_S 1000 #define TRUE 1 #define FALSE 0 #define DEBUGGING FALSE #define STATISTICS FALSE #define ceil_div(n,d) (((n)+(d)-1)/(d)) /* global variables */ static int naa; static int nzz; static int firstrow; static int lastrow; static int firstcol; static int lastcol; static int index_i; static int index_j; static int log2np_row; struct chunk_ { double d[NA + 2]; }; typedef struct chunk_ chunk; static shared chunk xfer[THREADS]; int np_row, np_column; int rowindex[THREADS+2]; int columnindex[THREADS+2]; /* common /main_int_mem/ */ static int colidx[NZ+1]; /* colidx[1:NZ] */ static int rowstr[NA+1+1]; static int iv[2*NA+1+1]; /* iv[1:2*NA+1] */ static int arow[NZ+1]; /* arow[1:NZ] */ static int acol[NZ+1]; /* acol[1:NZ] */ /* common /main_flt_mem/ */ static double v[NA+1+1]; /* v[1:NA+1] */ static double aelt[NZ+1]; /* aelt[1:NZ] */ /*shared a - sparse matrix*/ static double a[NZ+1]; /* a[1:NZ] */ shared [BLOCK_V]static double sx[NA+2+1]; /* x[1:NA+2] */ shared [BLOCK_V]static double sz[NA+2+1]; /* z[1:NA+2] */ /*template for reduction variable*/ shared static double sharevalary[THREADS]; shared static double shareval; shared [BLOCK_S] static double st[NA+2+1]; shared static double sharedTwoDarray[NA+2+1][THREADS]; /*extra memory working space*/ static int work1[NA+1]; static int work2[NA+1]; /* common /urando/ */ static double amult; static double tran; /* function declarations */ static int debugging(); static int statistics(); static int lpow2(int k); static int need_transpose( int index_i , int index_j, int * target_p); static void vector_reduction(int size[], double working[]); static void reduction_share(double *value_p); static double vector_dot_product(double *x, double *y, int size); static void floatcopyin2(double * dest, shared [BLOCK_V] double * sorc, int size, int block_size); static void floatcopyout2(shared [BLOCK_V] double * dest, double * sorc, int size, int block_size); static void reduction_TwoDarray(double working[]); static void conj_grad (int colidx[], int rowstr[], double a[], double * zeta_p, double *rnorm); static void makea(int n, int nz, double a[], int colidx[], int rowstr[], int nonzer, int firstrow, int lastrow, int firstcol, int lastcol, double rcond, int arow[], int acol[], double aelt[], double v[], int iv[], double shift ); static void sparse(double a[], int colidx[], int rowstr[], int n, int arow[], int acol[], double aelt[], int firstrow, int lastrow, double x[], boolean mark[], int nzloc[], int nnza); static void sprnvc(int n, int nz, double v[], int iv[], int nzloc[], int mark[]); static int icnvrt(double x, int ipwr2); static void vecset(int n, double v[], int iv[], int *nzv, int i, double val); /*-------------------------------------------------------------------- program cg --------------------------------------------------------------------*/ int main(int argc, char **argv) { int i, j, k, it; int nthreads=THREADS; double zeta; double rnorm; double t, mflops, t1; char class; boolean verified; double zeta_verify_value, epsilon; int NITER1, logt; if (NA == 1400 && NONZER == 7 && NITER == 15 && SHIFT == 10.0) { class = 'S'; zeta_verify_value = 8.5971775078648; } else if (NA == 7000 && NONZER == 8 && NITER == 15 && SHIFT == 12.0) { class = 'W'; zeta_verify_value = 10.362595087124; } else if (NA == 14000 && NONZER == 11 && NITER == 15 && SHIFT == 20.0) { class = 'A'; zeta_verify_value = 17.130235054029; } else if (NA == 75000 && NONZER == 13 && NITER == 75 && SHIFT == 60.0) { class = 'B'; zeta_verify_value = 22.712745482631; } else if (NA == 150000 && NONZER == 15 && NITER == 75 && SHIFT == 110.0) { class = 'C'; zeta_verify_value = 28.973605592845; } else { class = 'U'; } //found the np_row and np_column i = 1; logt =0; while( THREADS > i ){ i = 2*i; logt++; } if( THREADS != i){ //not a power of 2 printf("Sorry, the number of threads much be power of 2!\n"); return 1; } if (MYTHREAD == 0 ){ printf("\n\n NAS Parallel Benchmarks 2.3 UPC version" " - CG Benchmark\n"); printf(" Size: %10d\n", NA); printf(" Iterations: %5d\n", NITER); timer_clear( 20 ); timer_clear( 21 ); timer_clear( 22 ); } naa = NA; nzz = NZ; if (logt /2 * 2 == logt){ //even # np_row = (int)(pow(2, logt/2)); np_column = (int)(pow(2, logt/2)); } else{ np_row = (int)(pow(2, (logt+1)/2)); np_column = (int)(pow(2, (logt-1)/2)); } log2np_row = (logt+1)/2; index_i = MYTHREAD / np_row + 1; index_j = MYTHREAD % np_row + 1; /*-------------------------------------------------------------------- c Initialize random number generator c-------------------------------------------------------------------*/ tran = 314159265.0; amult = 1220703125.0; zeta = randlc( &tran, amult ); /*-------------------------------------------------------------------- c c-------------------------------------------------------------------*/ if( statistics() ){ timer_start( 20 ); } for( i=1; i<=np_column+1; i++){ rowindex[i] = (i-1) * NA/np_column + 1; } for( i=1; i<=np_row+1; i++){ columnindex[i] = (i-1) * NA/np_row + 1; } firstrow = rowindex[index_i]; lastrow = rowindex[index_i+1] - 1; firstcol = columnindex[index_j]; lastcol = columnindex[index_j+1] - 1; makea(naa, nzz, a, colidx, rowstr, NONZER, firstrow, lastrow, firstcol, lastcol, RCOND, arow, acol, aelt, v, iv, SHIFT); if( statistics() ){ timer_stop( 20 ); } /*--------------------------------------------------------------------- c Note: as a result of the above call to makea: c values of j used in indexing rowstr go from 1 --> lastrow-firstrow+1 c values of colidx which are col indexes go from firstcol --> lastcol c So: c Shift the col index vals from actual (firstcol --> lastcol ) c to local, i.e., (1 --> lastcol-firstcol+1) c---------------------------------------------------------------------*/ for ( i =1; i<= lastrow-firstrow+1; i++){ for( k =rowstr[i]; k c Do one iteration untimed to init all code and data page tables c----> (then reinit, start timing, to niter its) c-------------------------------------------------------------------*/ if( statistics() ){ timer_start( 22 ); } for (it = 1; it <= 1; it++) { /*-------------------------------------------------------------------- c The call to the conjugate gradient routine: c-------------------------------------------------------------------*/ conj_grad (colidx, rowstr, a, &zeta, &rnorm); /*-------------------------------------------------------------------- c zeta = shift + 1/(x.z) c So, first: (x.z) c Also, find norm of z c So, first: (z.z) c-------------------------------------------------------------------*/ } /* end of do one iteration untimed */ /*-------------------------------------------------------------------- c set starting vector to (1, 1, .... 1) c-------------------------------------------------------------------*/ if (MYTHREAD == 0){ for (i = 1; i <= NA+1; i++) { sx[i] = 1.0; } } //synchronize all thread upc_barrier; zeta = 0.0; if( statistics() ){ timer_stop( 22 ); } /*-------------------------------------------------------------------- c----> c Main Iteration for inverse power method c----> c-------------------------------------------------------------------*/ //initialize the timer for the performance analysis if(MYTHREAD == 0){ timer_clear( 1 ); timer_clear( 2 ); timer_clear( 3 ); timer_clear( 4); timer_clear( 5 ); timer_clear( 6 ); timer_clear( 7 ); timer_clear( 8 ); timer_clear( 9 ); timer_clear( 10 ); timer_clear( 11 ); timer_clear( 12 ); timer_clear( 13 ); timer_clear( 14 ); timer_clear( 15 ); timer_clear( 16 ); timer_clear( 17 ); timer_clear( 18 ); timer_clear( 19 ); timer_clear( 23 ); timer_clear( 24 ); } timer_clear( 25 ); if(MYTHREAD ==0 ) timer_start( 1 ); if ( debugging() ) return 0; for (it = 1; it <=NITER; it++) { /*------------------------------------------------------------------ c The call to the conjugate gradient routine: c-------------------------------------------------------------------*/ if( statistics() ){ timer_start( 2 ); } conj_grad(colidx, rowstr, a, &zeta, &rnorm); upc_barrier; if( statistics() ){ timer_stop( 2 ); } /*-------------------------------------------------------------------- c zeta = shift + 1/(x.z) c So, first: (x.z) c Also, find norm of z c So, first: (z.z) c-------------------------------------------------------------------*/ if (MYTHREAD == 0) /*#pragma omp master*/{ if( it == 1 ) { printf(" iteration ||r|| zeta\n"); } printf(" %5d %20.14e%20.13e\n", it, rnorm, zeta); } /*-------------------------------------------------------------------- c Normalize z to obtain x c-------------------------------------------------------------------*/ } /* end of main iter inv pow meth */ if(MYTHREAD ==0 ){ timer_stop( 1 ); t = timer_read( 1 ); } /*-------------------------------------------------------------------- c End of timed section c-------------------------------------------------------------------*/ if(MYTHREAD ==0 ){ printf(" Benchmark completed\n"); epsilon = 1.0e-10; if (class != 'U') { if (fabs(zeta - zeta_verify_value) <= epsilon) { verified = TRUE; printf(" VERIFICATION SUCCESSFUL\n"); printf(" Zeta is %20.12e\n", zeta); printf(" Error is %20.12e\n", zeta - zeta_verify_value); } else { verified = FALSE; printf(" VERIFICATION FAILED\n"); printf(" Zeta %20.12e\n", zeta); printf(" The correct zeta is %20.12e\n", zeta_verify_value); } } else { verified = FALSE; printf(" Problem size unknown\n"); printf(" NO VERIFICATION PERFORMED\n"); } if ( t != 0.0 ) { mflops = (2.0*NITER*NA) * (3.0+(NONZER*(NONZER+1)) + 25.0*(5.0+(NONZER*(NONZER+1))) + 3.0 ) / t / 1000000.0; } else { mflops = 0.0; } if( statistics() ){ t1 = timer_read( 1 ); printf("total time %e \n", t1); t1 = timer_read( 2 ); printf("conj time %e \n", t1); t1 = timer_read( 3 ); printf("a * p %e \n", t1); t1 = timer_read( 4 ); printf("reduction/transpose q %e \n", t1); t1 = timer_read( 5 ); printf("calculating d %e \n", t1); t1 = timer_read( 6 ); printf("r z update %e \n", t1); t1 = timer_read( 7 ); printf("rho %e \n", t1); t1 = timer_read( 8 ); printf("update p %e \n", t1); //t1 = timer_read( 9 ); // printf("reduction z %e \n", t1); t1 = timer_read( 10 ); printf("a * z %e \n", t1); t1 = timer_read( 11 ); printf("sum %e \n", t1); //t1 = timer_read( 12); // printf("sx szupdate %e \n", t1); t1 = timer_read( 13 ); printf("sx copy in %e \n", t1); t1 = timer_read( 14 ); printf("calculate rho0 %e \n", t1 ); t1 = timer_read( 15 ); printf("calculate norm %e \n", t1); t1 = timer_read( 16 ); printf("sx copyout %e \n", t1 ); t1 = timer_read( 17 ); printf("vector reduction total %e \n", t1 ); t1 = timer_read( 18 ); printf("vector reduction - loop %e \n", t1); t1 = timer_read( 19 ); printf("transpose %e \n", t1 ); t1 = timer_read( 20 ); printf("make a %e \n", t1); t1 = timer_read( 22 ); printf("first conj_grad %e \n", t1 ); t1 = timer_read( 23 ); printf("before we get to matrix * vector %e \n", t1 ); t1 = timer_read( 24 ); printf("after we get to matrix * vector %e \n", t1 ); } c_print_results("CG", class, NA, 0, 0, NITER, nthreads, t, mflops, " floating point", verified, NPBVERSION, COMPILETIME, NPB_CS1, NPB_CS2, NPB_CS3, NPB_CS4, NPB_CS5, NPB_CS6, NPB_CS7); } fflush(NULL); /* DOB: reduce output scrambling */ upc_barrier; //finally report the time spend on A * p t1 = timer_read( 25 ); printf("the total time spend on matrix * vector is %f %d\n", t1, MYTHREAD); } /*-------------------------------------------------------------------- c-------------------------------------------------------------------*/ static void conj_grad ( int colidx[], /* colidx[1:nzz] */ int rowstr[], /* rowstr[1:naa+1] */ double a[], /* a[1:nzz] */ double *zeta_p, double *rnorm ) /*-------------------------------------------------------------------- c-------------------------------------------------------------------*/ /*--------------------------------------------------------------------- c Floaging point arrays here are named as in NPB1 spec discussion of c CG algorithm c---------------------------------------------------------------------*/ { static double d, sum, rho, rho0, alpha, beta; int i, j, k; int cgit, cgitmax = 25; static double x[NA+2+1]; /* x[1:NA+2] */ static double z[NA+2+1]; /* z[1:NA+2] */ static double p[NA+2+1]; /* p[1:NA+2] */ static double q[NA+2+1]; /* q[1:NA+2] */ static double r[NA+2+1]; /* r[1:NA+2] */ static double w[NA+2+1]; /* w[1:NA+2] */ double t0, t; int start_p; double temp1, temp2; chunk * myxfer = (chunk*)&xfer[MYTHREAD]; double * myp = myxfer->d; int tsize = columnindex[index_j+1] - columnindex[index_j]; int * temps = (int*)malloc( (np_column+2) * sizeof(int)); int * temps1 = (int*)malloc( (np_column+2) * sizeof(int)); double tbuff[3]; if (debugging() ) { printf("CG starts\n"); } /*#pragma omp single nowait*/ rho = 0.0; for (j=1; j<=np_column+1; j++){ temps[j] = j; } for (j=1; j<=np_column+1; j++){ temps1[j] = 2*j; } /*-------------------------------------------------------------------- c Initialize the CG algorithm: c-------------------------------------------------------------------*/ /*initialization which need to be updated later*/ if( statistics() ){ timer_start( 13 ); } floatcopyin2(&(x[1]), &(sx[firstcol]), tsize, BLOCK_V); for(j=1; j<= tsize; j++){ q[j] = 0.0; z[j] = 0.0; r[j] = x[j]; p[j] = r[j]; w[j] = 0.0; } upc_barrier; if (debugging() ) { printf("CG complete initialization\n"); } if( statistics() ){ timer_stop (13 ); } /*-------------------------------------------------------------------- c rho = r.r c Now, obtain the norm of r: First, sum squares of r elements locally... c-------------------------------------------------------------------*/ if( statistics() ){ timer_start( 14 ); } rho = vector_dot_product(x, x, tsize); reduction_share(&rho); if( statistics() ){ timer_stop( 14 ); } /*-------------------------------------------------------------------- c----> c The conj grad iteration loop c----> c-------------------------------------------------------------------*/ if ( DEBUGGING ) cgitmax = 2; for (cgit = 1; cgit <=cgitmax; cgit++) { rho0 = rho; d = 0.0; rho = 0.0; //printf("completed one cycle\n"); /*-------------------------------------------------------------------- c q = A.p c The partition submatrix-vector multiply: use workspace w c--------------------------------------------------------------------- C C NOTE: this version of the multiply is actually (slightly: maybe %5) C faster on the sp2 on 16 nodes than is the unrolled-by-2 version C below. On the Cray t3d, the reverse is true, i.e., the C unrolled-by-two version is some 10% faster. C The unrolled-by-8 version below is significantly faster C on the Cray t3d - overall speed of code is 1.5 times faster. */ if( statistics() ){ timer_start( 23 ); } upc_barrier; upc_barrier; upc_barrier; upc_barrier; upc_barrier; if( statistics() ){ timer_stop( 23 ); } if( statistics() ){ timer_start( 3 ); } timer_start(25); //use the memory for the reduction for(j = 1; j <=lastrow-firstrow+1; j++) { sum = 0.0; for (k = rowstr[j]; k= firstcol && jcol <= lastcol) { scale = size * v[ivelt]; for (ivelt1 = 1; ivelt1 <= nzv; ivelt1++) { irow = iv[ivelt1]; if (irow >= firstrow && irow <= lastrow) { nnza = nnza + 1; if (nnza > nz) { printf("Space for matrix elements exceeded in" " makea\n"); printf("nnza, nzmax = %d, %d\n", nnza, nz); printf("iouter = %d\n", iouter); exit(1); } acol[nnza] = jcol; arow[nnza] = irow; aelt[nnza] = v[ivelt1] * scale; } } } } size = size * ratio; } /*--------------------------------------------------------------------- c ... add the identity * rcond to the generated matrix to bound c the smallest eigenvalue from below by rcond c---------------------------------------------------------------------*/ for (i = firstrow; i <= lastrow; i++) { if (i >= firstcol && i <= lastcol) { iouter = n + i; nnza = nnza + 1; if (nnza > nz) { printf("Space for matrix elements exceeded in makea\n"); printf("nnza, nzmax = %d, %d\n", nnza, nz); printf("iouter = %d\n", iouter); exit(1); } acol[nnza] = i; arow[nnza] = i; aelt[nnza] = rcond - shift; } } /*--------------------------------------------------------------------- c ... make the sparse matrix from list of elements with duplicates c (v and iv are used as workspace) c---------------------------------------------------------------------*/ sparse(a, colidx, rowstr, n, arow, acol, aelt, firstrow, lastrow, v, &(iv[0]), &(iv[n]), nnza); } /*--------------------------------------------------- c generate a sparse matrix from a list of c [col, row, element] tri c---------------------------------------------------*/ static void sparse( double a[], /* a[1:*] */ int colidx[], /* colidx[1:*] */ int rowstr[], /* rowstr[1:*] */ int n, int arow[], /* arow[1:*] */ int acol[], /* acol[1:*] */ double aelt[], /* aelt[1:*] */ int firstrow, int lastrow, double x[], /* x[1:n] */ boolean mark[], /* mark[1:n] */ int nzloc[], /* nzloc[1:n] */ int nnza) /*--------------------------------------------------------------------- c rows range from firstrow to lastrow c the rowstr pointers are defined for nrows = lastrow-firstrow+1 values c---------------------------------------------------------------------*/ { int nrows; int i, j, jajp1, nza, k, nzrow; double xi; /*-------------------------------------------------------------------- c how many rows of result c-------------------------------------------------------------------*/ nrows = lastrow - firstrow + 1; /*-------------------------------------------------------------------- c ...count the number of triples in each row c-------------------------------------------------------------------*/ /*#pragma omp parallel for */ for (j = 1; j <= n; j++) { rowstr[j] = 0; mark[j] = FALSE; } rowstr[n+1] = 0; for (nza = 1; nza <= nnza; nza++) { j = (arow[nza] - firstrow + 1) + 1; rowstr[j] = rowstr[j] + 1; } rowstr[1] = 1; for (j = 2; j <= nrows+1; j++) { rowstr[j] = rowstr[j] + rowstr[j-1]; } /*--------------------------------------------------------------------- c ... rowstr(j) now is the location of the first nonzero c of row j of a c---------------------------------------------------------------------*/ /*-------------------------------------------------------------------- c ... do a bucket sort of the triples on the row index c-------------------------------------------------------------------*/ for (nza = 1; nza <= nnza; nza++) { j = arow[nza] - firstrow + 1; k = rowstr[j]; a[k] = aelt[nza]; colidx[k] = acol[nza]; rowstr[j] = rowstr[j] + 1; } /*-------------------------------------------------------------------- c ... rowstr(j) now points to the first element of row j+1 c-------------------------------------------------------------------*/ for (j = nrows; j >= 1; j--) { rowstr[j+1] = rowstr[j]; } rowstr[1] = 1; /*-------------------------------------------------------------------- c ... generate the actual output rows by adding elements c-------------------------------------------------------------------*/ nza = 0; //#pragma omp parallel for for (i = 1; i <= n; i++) { x[i] = 0.0; mark[i] = FALSE; } jajp1 = rowstr[1]; for (j = 1; j <= nrows; j++) { nzrow = 0; /*-------------------------------------------------------------------- c ...loop over the jth row of a c-------------------------------------------------------------------*/ for (k = jajp1; k < rowstr[j+1]; k++) { i = colidx[k]; x[i] = x[i] + a[k]; if ( mark[i] == FALSE && x[i] != 0.0) { mark[i] = TRUE; nzrow = nzrow + 1; nzloc[nzrow] = i; } } /*-------------------------------------------------------------------- c ... extract the nonzeros of this row c-------------------------------------------------------------------*/ for (k = 1; k <= nzrow; k++) { i = nzloc[k]; mark[i] = FALSE; xi = x[i]; x[i] = 0.0; if (xi != 0.0) { nza = nza + 1; a[nza] = xi; colidx[nza] = i; } } jajp1 = rowstr[j+1]; rowstr[j+1] = nza + rowstr[1]; } } /*--------------------------------------------------------------------- c generate a sparse n-vector (v, iv) c having nzv nonzeros c c mark(i) is set to 1 if position i is nonzero. c mark is all zero on entry and is reset to all zero before exit c this corrects a performance bug found by John G. Lewis, caused by c reinitialization of mark on every one of the n calls to sprnvc ---------------------------------------------------------------------*/ static void sprnvc( int n, int nz, double v[], /* v[1:*] */ int iv[], /* iv[1:*] */ int nzloc[], /* nzloc[1:n] */ int mark[] ) /* mark[1:n] */ { int nn1; int nzrow, nzv, ii, i; double vecelt, vecloc; nzv = 0; nzrow = 0; nn1 = 1; do { nn1 = 2 * nn1; } while (nn1 < n); /*-------------------------------------------------------------------- c nn1 is the smallest power of two not less than n c-------------------------------------------------------------------*/ while (nzv < nz) { vecelt = randlc(&tran, amult); /*-------------------------------------------------------------------- c generate an integer between 1 and n in a portable manner c-------------------------------------------------------------------*/ vecloc = randlc(&tran, amult); i = icnvrt(vecloc, nn1) + 1; if (i > n) continue; /*-------------------------------------------------------------------- c was this integer generated already? c-------------------------------------------------------------------*/ if (mark[i] == 0) { mark[i] = 1; nzrow = nzrow + 1; nzloc[nzrow] = i; nzv = nzv + 1; v[nzv] = vecelt; iv[nzv] = i; } } for (ii = 1; ii <= nzrow; ii++) { i = nzloc[ii]; mark[i] = 0; } } /*--------------------------------------------------------------------- * scale a double precision number x in (0,1) by a power of 2 and chop it *---------------------------------------------------------------------*/ static int icnvrt(double x, int ipwr2) { return ((int)(ipwr2 * x)); } /*-------------------------------------------------------------------- c set ith element of sparse vector (v, iv) with c nzv nonzeros to val c-------------------------------------------------------------------*/ static void vecset( int n, double v[], /* v[1:*] */ int iv[], /* iv[1:*] */ int *nzv, int i, double val) { int k; boolean set; set = FALSE; for (k = 1; k <= *nzv; k++) { if (iv[k] == i) { v[k] = val; set = TRUE; } } if (set == FALSE) { *nzv = *nzv + 1; v[*nzv] = val; iv[*nzv] = i; } } static void reduction_share(double * value_p) { chunk * myxfer = (chunk*)&xfer[MYTHREAD]; double * myp = myxfer->d; int * temps = (int*)malloc( (np_column+2) * sizeof(int)); double tbuff[3]; int j; for (j=1; j<=np_column+1; j++){ temps[j] = j; } myp[0] = *value_p; vector_reduction(temps, tbuff); *value_p = tbuff[1]; } static void floatcopyin2(double * dest, shared [BLOCK_V] double * sorc, int size, int block_size) { int copy_size = 0; int mem_size = 0; int head = upc_phaseof(sorc); copy_size = ( (block_size - head) < (size - mem_size) ) ? block_size - head : size - mem_size; //copy the memory by block into the local copy do{ upc_memget(&dest[mem_size], &sorc[mem_size], copy_size*sizeof(double)); mem_size += copy_size; copy_size = (block_size < size - mem_size) ? block_size : size - mem_size; }while (mem_size