/* MCOP classic matrix chain ordering problem written in UPC by Mike Merrill, mhmerrill@mac.com shared matrix layout is distributed cyclic by columns so: iCost[r][c] is on thread c%THREADS in the algorithm iCost[i][j] a function of entries in row i and column j basic approach: loop over diagonals call them t=0 to n-1 for the (i,j)th subproblem i = j-t on the diagonal compute min of the recurrance below barrier between diagonals cost is the recurrance: iCost[i][j] = min{k=i+1 to j} of (iCost[i][k-1] + iCost[k][j] + (iDims[i] * iDims[k] * iDims[j+1])) each sucsessive diagonal represents t+1 long sub-problems the first diagonal has 1 long sub-problems because they are the i==j entries these all cost 0 the second diagonal has 2 long sub-problems the third diagonal has 3 long sub-problems ... until you get to the last diagonal which has one entry :-) The innermost loop might be a little confusing because it is unrolled by a factor of four. This loop can probably be rolled up and vectorized (I think) Anyway it just more fodder for people to look at... Mike. */ /* make UPC default to relaxed references */ #include #include #include #include #include #include #include /* Macros for timing */ struct tms t; #define WSEC() (times(&t) / (double)sysconf(_SC_CLK_TCK)) #define CPUSEC() (clock() / (double)CLOCKS_PER_SEC) /* define max problem size must be <= local size * THREADS */ /* need to use compile time THREADS environment for UPC */ #define PSIZE (4096) /* dimensions of matrices */ int iDims[PSIZE+1]; shared int shared_iDims[PSIZE+1]; /* distributed cost matrix */ /* last dimension of iCost and iBest are distributed cyclicly across threads */ shared int iCost[PSIZE][PSIZE]; shared int iBest[PSIZE][PSIZE]; void print(int iPsize) { int i, j; /* all threads print out the dimensions */ printf("iDims\n"); for(i = 0; i < iPsize+1; i++) printf("%10d",iDims[i]); printf("\n"); /* print out the cost table */ printf("iCost\n"); for(i = 0; i < iPsize; i++) { for(j = 0; j < iPsize; j++) { printf("%10d",iCost[i][j]); } printf("\n"); } /* print out the best table */ printf("iBest\n"); for(i = 0; i < iPsize; i++) { for(j = 0; j < iPsize; j++) { printf("%10d",iBest[i][j]); } printf("\n"); } } void order(int i, int j) { if (i == j) { printf("%c", (char) ('A'+i)); } else { printf("("); order(i, iBest[i][j]-1); order(iBest[i][j], j); printf(")"); } } void genData(int iPsize) { int i; /* get arround the fact that rand() is not thread safe */ /* and the berkeley pthread based upc does not handle correctly */ /* have thread 0 gen and store all dims in shared array */ if(MYTHREAD == 0) for(i = 0; i < iPsize+1; i++) shared_iDims[i] = (rand() % 29) + 5; upc_barrier 1000; /* read dims into a loacal array before use */ for(i = 0; i < iPsize+1; i++) iDims[i] = shared_iDims[i]; upc_barrier 1001; } int main(int argc, char *argv[]) { int iPsize = 26; /* 'A' - 'Z' */ int i,j,k,t; int lcol, rcol, rproc; int iTemp1, iTemp; int iSub1Cost, iSub2Cost, iOldCost, iOldBest; int iSub1Cost1, iSub1Cost2, iSub1Cost3; float time1, time2; int *lpCost_ij, *lpCost_kj, *lpBest_ij; if(argc >= 2) { iPsize = atoi(argv[1]); } if((iPsize > PSIZE) || (iPsize < 2)) { printf("error: input problem size out of bounds: %d\n", iPsize); exit(1); } if(MYTHREAD == 0) { printf("Number of threads = %d\n", THREADS); printf("Max Problem Size = %d\n", PSIZE); printf("Input problem size = %d\n", iPsize); } /* generate input data for problem */ genData(iPsize); /* initialize cost table */ for(j = MYTHREAD; j < iPsize; j += THREADS) { /* fill with INT_MAX down column to diagonal */ for(i = 0; i < j; i++) iCost[i][j] = INT_MAX; /* fill diagonal with 0 */ iCost[j][j] = 0; /* fill rest with -1 */ for(i = j+1; i < iPsize; i++) iCost[i][j] = -1; } /* wait until intialization is done on all threads */ upc_barrier 1; time1 = CPUSEC(); /* loop over diagonals */ /* each sucsessive diagonal represents n long sub-problems */ /* the first diagonal has 1 long sub-problems these all cost 0 */ /* the second diagonal has 2 long sub-problems */ /* the third diagonal has 3 long sub-problems */ /* ... until you get to the last one which has one entry :-) */ /* don't need to look at diag t=0 because these all cost 0 */ for(t = 1; t < iPsize; t++) { /* start at MYTHREAD */ for(j = MYTHREAD; j < iPsize; j += THREADS) { /* does this column participate in the diagonal */ if(j >= t) { /* row for this diagonal */ i = j - t; /* local fetch */ lpCost_ij = (int *) &(iCost[i][j]); iOldCost = (*lpCost_ij); /* init best loc */ iOldBest = -1; /* UNROLLED by 4... pick up all pairs of sub problems */ for(k=i+1; k <= j-4; k += 4) { /* remote pre-fetch */ iSub1Cost = iCost[i][k-1]; /* remote pre-fetch */ iSub1Cost1 = iCost[i][k]; /* remote pre-fetch */ iSub1Cost2 = iCost[i][k+1]; /* remote pre-fetch */ iSub1Cost3 = iCost[i][k+2]; /* local fetch */ lpCost_kj = (int *) &(iCost[k][j]); iSub2Cost = (*lpCost_kj); /* compute cost of these pair of sub problems */ iTemp1 = iDims[i] * iDims[k] * iDims[j+1]; iTemp = iSub1Cost + iSub2Cost + iTemp1; /* check if less than old cost */ if(iTemp < iOldCost) { /* save values might be optimal */ iOldCost = iTemp; iOldBest = k; } /* local fetch */ lpCost_kj = (int *) &(iCost[k+1][j]); iSub2Cost = (*lpCost_kj); /* compute cost of these pair of sub problems */ iTemp1 = iDims[i] * iDims[k+1] * iDims[j+1]; iTemp = iSub1Cost1 + iSub2Cost + iTemp1; /* check if less than old cost */ if(iTemp < iOldCost) { /* save values might be optimal */ iOldCost = iTemp; iOldBest = k+1; } /* local fetch */ lpCost_kj = (int *) &(iCost[k+2][j]); iSub2Cost = (*lpCost_kj); /* compute cost of these pair of sub problems */ iTemp1 = iDims[i] * iDims[k+2] * iDims[j+1]; iTemp = iSub1Cost2 + iSub2Cost + iTemp1; /* check if less than old cost */ if(iTemp < iOldCost) { /* save values might be optimal */ iOldCost = iTemp; iOldBest = k+2; } /* local fetch */ lpCost_kj = (int *) &(iCost[k+3][j]); iSub2Cost = (*lpCost_kj); /* compute cost of these pair of sub problems */ iTemp1 = iDims[i] * iDims[k+3] * iDims[j+1]; iTemp = iSub1Cost + iSub2Cost + iTemp1; /* check if less than old cost */ if(iTemp < iOldCost) { /* save values might be optimal */ iOldCost = iTemp; iOldBest = k+3; } } /* pick up rest */ for(; k <= j; k++) { /* remote pre-fetch */ iSub1Cost = iCost[i][k-1]; /* local fetch */ lpCost_kj = (int *) &(iCost[k][j]); iSub2Cost = (*lpCost_kj); /* compute cost of these pair of sub problems */ iTemp1 = iDims[i] * iDims[k] * iDims[j+1]; iTemp = iSub1Cost + iSub2Cost + iTemp1; /* check if less than old cost */ if(iTemp < iOldCost) { /* save values might be optimal */ iOldCost = iTemp; iOldBest = k; } } /* local stores */ *lpCost_ij = iOldCost; lpBest_ij = (int *) &(iBest[i][j]); *lpBest_ij = iOldBest; } } /* make everyone wait until diagonal is done */ upc_barrier 2; } time2 = CPUSEC(); if(MYTHREAD == 0) { if(iPsize <= 26) { print(iPsize); order(0, iPsize-1); } printf("\n"); printf("best cost = %d\n", iCost[0][iPsize-1]); printf("%f seconds\n", (time2-time1)); } if(iPsize <= 10) { for(i=0;i