#include #include #include #include #include #include #include "mmio.h" #include "bupc_timers.h" /******* * * Type definitions */ typedef struct { double *data; double block_size; } poisson_jacobi_t; typedef struct csr_matrix_t { /* Dimensions, number of nonzeros * (m == n for square, m < n for a local "slice") */ int m, n, nz; /* Start of the rows owned by each thread */ int start[THREADS + 1]; /* Starts of rows owned by local processor * row_start[0] == 0 == offset of first nz in row start[MY_THREAD] */ int *row_start; /* Column indices and values of matrix elements at local processor */ int *col_idx; double *val; } csr_matrix_t; typedef struct csr_jacobi_t { int n; int block_size; double *factored_blocks; } csr_jacobi_t; #ifndef min #define min(a,b) ((a) < (b) ? (a) : (b)) #endif /* Test problem: 1-d Poisson operator * * Assume each processor has some contiguous chunk of the variables, * with processor 0 at the start and processor MYTHREADS at the end. * CAVEAT: Each processor must have *some* variables, or the code * to handle the boundaries is going to get messed up. */ void poisson_matvec(double *Ax, void *Adata, double *x, int n) { int i; static shared double left_edge[THREADS]; static shared double right_edge[THREADS]; /* Communicate the boundary values */ left_edge[MYTHREAD] = x[0]; right_edge[MYTHREAD] = x[n - 1]; /* Split-phase barrier with the local part of the computation * inside. This should take enough time to hide the communication * of the boundary values */ upc_notify; Ax[0] = 2 * x[0] - x[1]; for (i = 1; i < n - 1; ++i) Ax[i] = -x[i - 1] + 2 * x[i] - x[i + 1]; Ax[n - 1] = -x[n - 2] + 2 * x[n - 1]; upc_wait; /* Now incorporate the contributions from boundaries between * processors, synchronize, and move forward */ if (MYTHREAD > 0) Ax[0] -= right_edge[MYTHREAD - 1]; if (MYTHREAD < THREADS - 1) Ax[n - 1] -= left_edge[MYTHREAD + 1]; upc_barrier; } /**************************************** * * CSR code */ typedef struct { int i, j; double val; } matrix_entry_t; int entry_comparison(const void *e1, const void *e2) { const matrix_entry_t *entry1 = (const matrix_entry_t *) e1; const matrix_entry_t *entry2 = (const matrix_entry_t *) e2; if (entry1->i < entry2->i) return -1; else if (entry1->i > entry2->i) return 1; if (entry1->j < entry2->j) return -1; else if (entry1->j > entry2->j) return 1; return 0; } csr_matrix_t *csr_mm_load(char *filename) { int ret_code; MM_typecode matcode; FILE *f; int entry_count; int i, M, N, nz; matrix_entry_t *entries; matrix_entry_t *entry; int *row_start; int *col_index; double *val; csr_matrix_t *A; if ((f = fopen(filename, "r")) == NULL) exit(1); if (mm_read_banner(f, &matcode) != 0) { printf("Could not process Matrix Market banner.\n"); exit(1); } if (!mm_is_sparse(matcode) || !mm_is_symmetric(matcode) || !mm_is_real(matcode)) { printf("Sorry, this application does not support "); printf("Market Market type: [%s]\n", mm_typecode_to_str(matcode)); exit(1); } /* find out size of sparse matrix .... */ if ((ret_code = mm_read_mtx_crd_size(f, &M, &N, &nz)) != 0) exit(1); /* reserve memory for matrices */ row_start = (int *) malloc((M + 1) * sizeof(int)); col_index = (int *) malloc((2 * nz - M) * sizeof(int)); val = (double *) malloc((2 * nz - M) * sizeof(double)); entries = (matrix_entry_t *) malloc((2 * nz - M) * sizeof(matrix_entry_t)); /* NOTE: when reading in doubles, ANSI C requires the use of the "l" */ /* specifier as in "%lg", "%lf", "%le", otherwise errors will occur */ /* (ANSI C X3.159-1989, Sec. 4.9.6.2, p. 136 lines 13-15) */ entry = entries; entry_count = 0; for (i = 0; i < nz; i++) { int row, col; double val; fscanf(f, "%d %d %lg\n", &row, &col, &val); --row; /* adjust to 0-based */ --col; //assert(row >= 0 && col >= 0); //assert(entry_count++ < 2 * nz - M); entry->i = row; entry->j = col; entry->val = val; ++entry; if (row != col) { /* Fill out the other half... */ //assert(entry_count++ < 2 * nz - M); entry->i = col; entry->j = row; entry->val = val; ++entry; } } if (f != stdin) fclose(f); /**********************************/ /* now make CSR version of matrix */ /**********************************/ nz = 2 * nz - M; qsort(entries, nz, sizeof(matrix_entry_t), entry_comparison); entry = entries; row_start[0] = 0; for (i = 0; i < nz; ++i) { row_start[entry->i + 1] = i + 1; col_index[i] = entry->j; val[i] = entry->val; ++entry; } free(entries); A = (csr_matrix_t *) malloc(sizeof(csr_matrix_t)); A->m = M; A->n = N; A->nz = nz; A->row_start = row_start; A->col_idx = col_index; A->val = val; return A; } /* Load a sparse matrix in Matrix Market format and distribute * the values among processors */ csr_matrix_t *csr_mm_upc_load(char *filename) { csr_matrix_t *initial_matrix = NULL; csr_matrix_t *local_matrix = NULL; int *start; int i; int n_per_proc, nlocal, nzlocal; long long first_nz; long long tmp = 0, tmp1 = 0; static shared long long n; static shared long long row_start[10000]; static shared long long col_idx[100000]; static shared double val[100000]; /* Read the initial matrix at processor 0 and copy it * into shared space */ if (MYTHREAD == 0) { int nz; initial_matrix = csr_mm_load(filename); n = initial_matrix->n; nz = initial_matrix->nz; //assert(nz < 100000); //assert(n < 10000); for (i = 0; i <= n; ++i) row_start[i] = initial_matrix->row_start[i]; for (i = 0; i < nz; ++i) { col_idx[i] = initial_matrix->col_idx[i]; val[i] = initial_matrix->val[i]; } free(initial_matrix->val); free(initial_matrix->col_idx); free(initial_matrix->row_start); free(initial_matrix); } upc_barrier; /* Share out sections of the matrix to each processor */ local_matrix = (csr_matrix_t *) malloc(sizeof(csr_matrix_t)); tmp = n; local_matrix->n = (int) tmp; n_per_proc = local_matrix->n / THREADS; start = local_matrix->start; for (i = 0; i < THREADS; ++i) { start[i] = i * n_per_proc; } start[THREADS] = (int) tmp; nlocal = start[MYTHREAD + 1] - start[MYTHREAD]; local_matrix->m = nlocal; local_matrix->row_start = (int *) malloc((nlocal + 1) * sizeof(int)); first_nz = row_start[start[MYTHREAD]]; for (i = 0; i <= nlocal; ++i) { local_matrix->row_start[i] = row_start[start[MYTHREAD] + i] - first_nz; } nzlocal = row_start[start[MYTHREAD + 1]] - row_start[start[MYTHREAD]]; local_matrix->nz = nzlocal; local_matrix->col_idx = (int *) malloc(nzlocal * sizeof(int)); local_matrix->val = (double *) malloc(nzlocal * sizeof(double)); assert(local_matrix->col_idx != NULL); assert(local_matrix->val != NULL); for (i = 0; i < nzlocal; ++i) { local_matrix->col_idx[i] = col_idx[first_nz + i]; local_matrix->val[i] = val[first_nz + i]; } upc_barrier; return local_matrix; } /* Multiply by a matrix in compressed sparse row format */ void csr_matvec(double *Ax, void *Adata, double *x, int n) { int i, j; csr_matrix_t *Acsr = (csr_matrix_t *) Adata; double *Aval = Acsr->val; int *Arow = Acsr->row_start; int *Acol = Acsr->col_idx; int my_start = Acsr->start[MYTHREAD]; static shared double xglobal[100000]; /* Copy local vector section into shared memory */ for (i = 0; i < n; ++i) { xglobal[my_start + i] = x[i]; } //upc_memput(&xglobal[my_start], x, n * sizeof(double)); upc_barrier; for (i = 0; i < n; ++i) { Ax[i] = 0; for (j = Arow[i]; j < Arow[i + 1]; ++j) { int col = *Acol++; double Aelement = *Aval++; Ax[i] += Aelement * xglobal[col]; } } upc_barrier; } /********************************************** * * Dot product code */ /* Compute alpha*x + y and store in dest (daxpy-like operation) * * Arguments: * dest - destination vector. Can be the same as x or y. * alpha - scalar multiplier * x, y - vector inputs * n - vector size */ void axpy(double *dest, double alpha, double *x, double *y, int n) { int i; for (i = 0; i < n; ++i) dest[i] = alpha * x[i] + y[i]; } /* Compute the dot product of two vectors x'*y * (Each thread passes in the local part of x and y) * * Arguments: * x, y - vector inputs (local part) * n - vector size (local part) */ double ddot(double *x, double *y, int n) { int i; double final_sum = 0; double local_sum = 0; static shared double thread_sums[THREADS]; upc_notify; local_sum = 0; for (i = 0; i < n; ++i) local_sum += x[i] * y[i]; upc_wait; thread_sums[MYTHREAD] = local_sum; upc_barrier; final_sum = 0; for (i = 0; i < THREADS; ++i) final_sum += thread_sums[i]; return final_sum; } /* Solve Ax = b using a preconditioned conjugate-gradient iteration. * * Arguments: * matvec(Ax, Adata, x, n) - Multiply A*x into Ax * psolve(Minvx, Mdata, x, n) - Apply preconditioner (solve M\x into Minvx) * Adata - Opaque pointer to data used by matvec * Mdata - Opaque pointer to data used by psolve * b - System right-hand side (local part) * x - Result (local part) * rtol - Tolerance on the relative residual (||r||/||b||) * n - System size (local part) * rhist - Residual norm history. Should be at least maxiter doubles * if it is not NULL. * maxiter - Maximum number of iterations before returning * * Returns: * Iteration count on success, -1 on failure */ int precond_cg(void (*matvec) (double *Ax, void *Adata, double *x, int n), void (*psolve) (double *Minvx, void *Adata, double *x, int n), void *Adata, void *Mdata, double *b, double *x, double rtol, int n, double *rhist, int maxiter) { const int nbytes = n * sizeof(double); double bnorm2; /* ||b||^2 */ double rnorm2; /* Residual norm squared */ double rz, rzold; /* r'*z from two successive iterations */ double alpha, beta; double *s; /* Search direction */ double *r; /* Residual */ double *z; /* Temporary vector */ int i = 0; /* Current iteration */ upc_barrier; s = (double *) malloc(nbytes); r = (double *) malloc(nbytes); z = (double *) malloc(nbytes); bnorm2 = ddot(b, b, n); memset(x, 0, nbytes); memcpy(r, b, nbytes); psolve(z, Mdata, r, n); memcpy(s, z, nbytes); rz = ddot(r, z, n); rnorm2 = ddot(r, r, n); for (i = 0; i < maxiter && rnorm2 > bnorm2 * rtol * rtol; ++i) { if (rhist != NULL) rhist[i] = sqrt(rnorm2 / bnorm2); matvec(z, Adata, s, n); alpha = rz / ddot(s, z, n); axpy(x, alpha, s, x, n); axpy(r, -alpha, z, r, n); psolve(z, Mdata, r, n); rzold = rz; rz = ddot(r, z, n); rnorm2 = ddot(r, r, n); beta = -rz / rzold; axpy(s, -beta, s, z, n); } free(z); free(r); free(s); if (i >= maxiter) return -1; if (rhist != NULL) rhist[i] = sqrt(rnorm2 / bnorm2); upc_barrier; return i; } /* Dummy preconditioner... just the identity */ void dummy_psolve(double *Minvx, void *Mdata, double *x, int n) { memcpy(Minvx, x, n * sizeof(double)); } /* Main routine -- test out the code on a sample problem */ void driver(int *start, int maxiter, void (*matvec) (double *, void *, double *, int), void *Adata, void (*psolve) (double *, void *, double *, int), void *Mdata) { int N = start[THREADS] - start[0]; static shared double *shared xglobal; int n = start[MYTHREAD + 1] - start[MYTHREAD]; double *b = (double *) malloc(n * sizeof(double)); double *x = (double *) malloc(n * sizeof(double)); double *rhist = NULL; double rtol = 1e-3; int i, retval = 0; FILE *rhist_fp = NULL; FILE *x_fp = NULL; /* I/O related initialization at thread 0 */ if (MYTHREAD == 0) { #ifdef __UPC_VERSION__ // UPC version 1.1 or higher xglobal = (shared double *) upc_alloc(sizeof(double) * N); #else xglobal = (shared double *) upc_local_alloc(sizeof(double), N); #endif rhist = (double *) malloc(maxiter * sizeof(double)); //rhist_fp = fopen("rhist.out", "w"); //x_fp = fopen("x.out", "w"); } upc_barrier; /* Set up the (local) RHS */ for (i = 0; i < n; ++i) b[i] = 1; /* Do CG */ retval = precond_cg(matvec, psolve, Adata, Mdata, b, x, rtol, n, rhist, maxiter); /* Put local parts of the solution into the global space */ //for (i = 0; i < n; ++i) // xglobal[start[MYTHREAD] + i] = x[i]; upc_barrier; /* I/O related output at thread 0 */ if (MYTHREAD == 0) { //for (i = 0; i < n; ++i) // fprintf(x_fp, "%g\n", xglobal[i]); if (retval < 0) { printf("Iteration failed to converge!\n"); //for (i = 0; i < maxiter; ++i) // fprintf(rhist_fp, "%g\n", rhist[i]); } else { printf("Converged after %d iterations\n", retval); //for (i = 0; i <= retval; ++i) // fprintf(rhist_fp, "%g\n", rhist[i]); } //fclose(x_fp); //fclose(rhist_fp); free(rhist); upc_free(xglobal); } free(x); free(b); upc_barrier; } int main(int argc, char **argv) { if (MYTHREAD == 0) { printf("Processor 0!\n"); } timer_clear(0); timer_clear(1); timer_clear(2); if (argc <= 1) { const int problem_size = 500; const int block_size = 20; int i; int start[THREADS + 1]; int n_per_proc = problem_size / THREADS; if (MYTHREAD == 0) printf("Using default 1-d Poisson on a %d point mesh\n", problem_size); start[THREADS] = 500; for (i = 0; i < THREADS; ++i) start[i] = i * n_per_proc; timer_start(1); driver(start, 500, poisson_matvec, NULL, dummy_psolve, NULL); timer_stop(1); printf("time for 500 possion matrix without precond: %f\n", timer_read(1)); } else { int block_size = 60; csr_matrix_t *A; if (MYTHREAD == 0) printf("Loading...\n"); A = csr_mm_upc_load(argv[1]); if (MYTHREAD == 0) printf("Vanilla CG: "); timer_start(1); driver(A->start, 2000, csr_matvec, A, dummy_psolve, NULL); timer_stop(1); printf("time for %s with block jacobi: %f\n", argv[1], timer_read(1)); } return 0; }