/*--------------------------------------------------------------------

  NAS Parallel Benchmarks 2.4 UPC versions - IS

  This benchmark is an UPC version of the NPB IS code.

  The UPC versions are developed by HPCL-GWU and are derived from
  the OpenMP version (developed by RWCP).

  Permission to use, copy, distribute and modify this software for any
  purpose with or without fee is hereby granted.  This software is
  provided "as is" without express or implied warranty.

  Information on the UPC project at GWU is available at:

          http://upc.gwu.edu

  Information on NAS Parallel Benchmarks is available at:

          http://www.nas.nasa.gov/Software/NPB/

--------------------------------------------------------------------*/

/* THIS VERSION IS IDENTICAL TO THE NPB2.3 VERSION */
/* PLAIN UPC VERSION */

/*--------------------------------------------------------------------

  UPC version:  F. Cantonnet  - GWU - HPCL (fcantonn@gwu.edu)
                A. Agarwal    - GWU - HPCL (agarwala@gwu.edu)
                T. El-Ghazawi - GWU - HPCL (tarek@gwu.edu)
                F. Vroman

  Authors(NAS): M. Yarrow

--------------------------------------------------------------------*/

/*--------------------------------------------------------------------

  This program is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation; either version 2 of the License, or
  (at your option) any later version.

  This program is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with this program; if not, write to the Free Software
  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
  USA

--------------------------------------------------------------------*/

#include "is-common-1.h"

INT_TYPE min_key_val, max_key_val;

shared INT_TYPE *bucket_size_totals_shd;
shared INT_TYPE *bucket_size_shd;
shared INT_TYPE *key_buff1_shd;

#include "is-common-2.h"

void full_verify()
{
    INT_TYPE i, j, k;

    k = 0;

    /* Now, finally, sort the keys: */
    for( i=0; i<total_local_keys; i++ )
        key_array[--key_buff_ptr_global[key_buff2[i]]
                        - total_lesser_keys] = key_buff2[i];

    j = 0;
    if( k > key_array[0] )
        j++;

    /* Confirm keys correctly sorted: count incorrectly sorted keys, if any */
    for( i=1; i<total_local_keys; i++ )
        if( key_array[i-1] > key_array[i] )
            j++;

    if( j != 0 )
        printf( "Processor %d: Full_verify: number of keys out of sort: %d\n",
                MYTHREAD, j );
    else
        passed_verification_shd[MYTHREAD]++;
}

typedef int elem_t;

void upc_reduced_sum( shared elem_t *buffer, size_t size )
{
    shared elem_t *shared *ptrs;
    int thr_cnt;
    int p;
    elem_t *local_array, *lp;

    ptrs = (shared elem_t * shared *)upc_all_alloc( THREADS, sizeof(elem_t *) );
    assert( ptrs != NULL );

    ptrs[MYTHREAD] = (shared elem_t *)buffer;
    local_array = (elem_t *)malloc( size * sizeof(elem_t) );
    lp = (elem_t *)buffer;

    upc_barrier;

    /* reduce: thread 0 pulls every other thread's slice and adds it into
       its own copy, so only thread 0 ends up holding the totals */
    upc_forall( thr_cnt=1; thr_cnt<THREADS; thr_cnt++; 0 )
    {
        upc_memget( local_array, ptrs[thr_cnt], size * sizeof(elem_t) );
        for( p=0; p<size; p++ )
            lp[p] += local_array[p];
    }

    upc_barrier;

    free( local_array );
}
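/* Illustrative sketch (not part of the original benchmark): the call
   pattern that rank() uses below with upc_reduced_sum().  Every thread
   passes its own slice of a cyclic shared array; after the call only
   thread 0 holds the element-wise sums, so the other threads pull the
   result back with upc_memcpy().  The names totals_demo_shd and
   reduce_and_broadcast_demo are hypothetical and exist only for this
   example. */

shared elem_t *totals_demo_shd;   /* assumed: THREADS*n elements, default cyclic layout */

void reduce_and_broadcast_demo( size_t n )
{
    /* every thread contributes the slice that has affinity to it */
    upc_reduced_sum( &totals_demo_shd[MYTHREAD], n );

    /* only thread 0 now has the totals; the rest copy them into their slice */
    if( MYTHREAD != 0 )
        upc_memcpy( &totals_demo_shd[MYTHREAD], &totals_demo_shd[0],
                    n * sizeof(elem_t) );
    upc_barrier;
}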
void rank( int iteration )
{
    INT_TYPE i, j, k, m;
    INT_TYPE key;
    INT_TYPE shift = MAX_KEY_LOG_2 - NUM_BUCKETS_LOG_2;
    INT_TYPE bucket_sum_accumulator, local_bucket_sum_accumulator;
    INT_TYPE total_displ;

    TIMER_START( TIMER_COMPTIME );

    /* Iteration alteration of keys */
    if( MYTHREAD == 0 )
    {
        key_array[iteration] = iteration;
        key_array[iteration+MAX_ITERATIONS] = MAX_KEY - iteration;
    }

    /* Initialize */
    for( i=0; i<NUM_BUCKETS+TEST_ARRAY_SIZE; i++ )
    {
        bucket_size_shd[MYTHREAD + i*THREADS] = 0;
        bucket_size_totals_shd[MYTHREAD + i*THREADS] = 0;
        process_bucket_distrib_ptr1[i] = 0;
        process_bucket_distrib_ptr2[i] = 0;
    }

    /* Determine where the partial verify test keys are, load into
       top of array bucket_size */
    for( i=0; i<TEST_ARRAY_SIZE; i++ )
        if( (test_index_array[i]/NUM_KEYS) == MYTHREAD )
            bucket_size_shd[MYTHREAD + (NUM_BUCKETS+i)*THREADS] =
                key_array[test_index_array[i] % NUM_KEYS];

    /* Determine the number of keys in each bucket */
    for( i=0; i<NUM_KEYS; i++ )
        bucket_size_shd[MYTHREAD + ((key_array[i] >> shift)*THREADS)]++;

    /* Accumulative bucket sizes are the bucket pointers */
    bucket_ptrs[0] = 0;
    for( i=1; i< NUM_BUCKETS; i++ )
        bucket_ptrs[i] = bucket_ptrs[i-1] +
                         bucket_size_shd[MYTHREAD + (i-1)*THREADS];

    /* Sort into appropriate bucket */
    for( i=0; i<NUM_KEYS; i++ )
    {
        key = key_array[i];
        key_buff1_shd[MYTHREAD + (bucket_ptrs[key >> shift]++)*THREADS] = key;
    }

    upc_memcpy( &bucket_size_totals_shd[MYTHREAD], &bucket_size_shd[MYTHREAD],
                (NUM_BUCKETS+TEST_ARRAY_SIZE) * sizeof(INT_TYPE) );

    TIMER_STOP( TIMER_COMPTIME );
    TIMER_START( TIMER_COMMTIME );
    TIMER_START( TIMER_ALLREDUCE );

    /* Get the bucket size totals for the entire problem.  These will
       be used to determine the redistribution of keys */
    upc_reduced_sum( &bucket_size_totals_shd[MYTHREAD],
                     NUM_BUCKETS+TEST_ARRAY_SIZE );

    /* All other threads copy the result locally.  With upc_reduced_sum
       only thread 0 has the final result */
    if( MYTHREAD != 0 )
        upc_memcpy( &bucket_size_totals_shd[MYTHREAD],
                    &bucket_size_totals_shd[0],
                    (NUM_BUCKETS+TEST_ARRAY_SIZE) * sizeof(INT_TYPE) );

    TIMER_STOP( TIMER_ALLREDUCE );
    TIMER_START( TIMER_ALLTOALL );

    /* Determine the redistribution of keys: accumulate the bucket size
       totals till this number surpasses NUM_KEYS (which is the average
       number of keys per processor).  Then all keys in these buckets go
       to processor 0.  Continue accumulating again until surpassing
       2*NUM_KEYS.  All keys in these buckets go to processor 1, etc.
       This algorithm guarantees that all processors have work ranking;
       no processors are left idle.  The optimum number of buckets,
       however, does not result in as high a degree of load balancing
       (as even a distribution of keys as is possible) as is obtained
       from increasing the number of buckets, but more buckets results
       in more computation per processor so that the optimum number of
       buckets turns out to be 1024 for machines tested.  Note that
       process_bucket_distrib_ptr1 and ..._ptr2 hold the bucket number
       of first and last bucket which each processor will have after
       the redistribution is done. */

    bucket_sum_accumulator = 0;
    local_bucket_sum_accumulator = 0;
    send_infos_shd[0][MYTHREAD].displ = 0;
    process_bucket_distrib_ptr1[0] = 0;

    for( i = 0, j = 0; i < NUM_BUCKETS; i++ )
    {
        bucket_sum_accumulator       += bucket_size_totals_shd[MYTHREAD + (i * THREADS)];
        local_bucket_sum_accumulator += bucket_size_shd[MYTHREAD + (i * THREADS)];

        if( bucket_sum_accumulator >= (j+1)*NUM_KEYS )
        {
            send_infos_shd[j][MYTHREAD].count = local_bucket_sum_accumulator;
            if( j != 0 )
            {
                send_infos_shd[j][MYTHREAD].displ =
                    send_infos_shd[j-1][MYTHREAD].displ +
                    send_infos_shd[j-1][MYTHREAD].count;
                process_bucket_distrib_ptr1[j] = process_bucket_distrib_ptr2[j-1]+1;
            }
            process_bucket_distrib_ptr2[j++] = i;
            local_bucket_sum_accumulator = 0;
        }
    }

    /* Equivalent to the MPI_Alltoall + MPI_Alltoallv calls in the
       C + MPI version of the NAS Parallel Benchmarks. */
    total_displ = 0;
    upc_barrier;

    for( i = 0; i < THREADS; i++ )
    {
        upc_memget( key_buff2 + total_displ,
                    key_buff1_shd + i + send_infos_shd[MYTHREAD][i].displ * THREADS,
                    send_infos_shd[MYTHREAD][i].count * sizeof(INT_TYPE) );
        total_displ += send_infos_shd[MYTHREAD][i].count;
    }
    upc_barrier;

    TIMER_STOP( TIMER_ALLTOALL );
    TIMER_STOP( TIMER_COMMTIME );
    TIMER_START( TIMER_COMPTIME );

    /* The starting and ending bucket numbers on each processor are
       multiplied by the interval size of the buckets to obtain the
       smallest possible min and greatest possible max value of any
       key on each processor */
    min_key_val = process_bucket_distrib_ptr1[MYTHREAD] << shift;
    max_key_val = ((process_bucket_distrib_ptr2[MYTHREAD] + 1) << shift)-1;

    /* Clear the work array */
    upc_memset( &key_buff1_shd[MYTHREAD], 0,
                (max_key_val - min_key_val + 1) * sizeof(INT_TYPE) );

    /* Determine the total number of keys on all other processors
       holding keys of lesser value */
    m = 0;
    for( k=0; k