/*--------------------------------------------------------------------
  NAS Parallel Benchmarks 2.4 UPC versions - IS

  This benchmark is an UPC version of the NPB IS code.
  The UPC versions are developed by HPCL-GWU and are derived from
  the OpenMP version (developed by RWCP).

  Permission to use, copy, distribute and modify this software for any
  purpose with or without fee is hereby granted.  This software is
  provided "as is" without express or implied warranty.

  Information on the UPC project at GWU is available at:

          http://upc.gwu.edu

  Information on NAS Parallel Benchmarks is available at:

          http://www.nas.nasa.gov/Software/NPB/
--------------------------------------------------------------------*/
/* THIS VERSION IS IDENTICAL TO THE NPB2.3 VERSION
   FULLY OPTIMIZED UPC VERSION                                        */
/*--------------------------------------------------------------------
  UPC version:  F. Cantonnet  - GWU - HPCL (fcantonn@gwu.edu)
                A. Agarwal    - GWU - HPCL (agarwala@gwu.edu)
                T. El-Ghazawi - GWU - HPCL (tarek@gwu.edu)
                F. Vroman

  Author (NAS): M. Yarrow
--------------------------------------------------------------------*/
/*--------------------------------------------------------------------
  This program is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation; either version 2 of the License, or
  (at your option) any later version.

  This program is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with this program; if not, write to the Free Software
  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
  USA
--------------------------------------------------------------------*/

#include <stdio.h>   /* printf       */
#include <stdlib.h>  /* malloc, free */
#include <string.h>  /* memcpy       */

#include "is-common-1.h"

INT_TYPE *key_buff_ptr_global;
int      *passed_verification;

shared INT_TYPE bucket_size_totals_shd[(NUM_BUCKETS+TEST_ARRAY_SIZE) * THREADS];
shared INT_TYPE bucket_size_shd[THREADS * (NUM_BUCKETS+TEST_ARRAY_SIZE)];
shared INT_TYPE key_buff1_shd[THREADS * SIZE_OF_BUFFERS];

/* Private pointers to the local slices of the shared buffers */
INT_TYPE  *bucket_size_totals;
INT_TYPE  *bucket_size;
INT_TYPE  *key_buff1;
send_info *send_infos;

#include "is-common-2.h"

void full_verify( void )
{
    INT_TYPE i, j, k;

    k = 0;

    /* Now, finally, sort the keys: */
    for( i=0; i<total_local_keys; i++ )
        key_array[--key_buff_ptr_global[key_buff2[i]] - total_lesser_keys] =
            key_buff2[i];

    /* Confirm that this thread's first key is not smaller than the largest
       key of the previous thread.  (Assumption: k holds that key, as in
       the reference MPI code.) */
    j = 0;
    if( MYTHREAD != 0 )
        if( k > key_array[0] )
            j++;

    /* Confirm keys correctly sorted: count incorrectly sorted keys, if any */
    for( i=1; i<total_local_keys; i++ )
        if( key_array[i-1] > key_array[i] )
            j++;

    if( j != 0 )
        printf( "Processor %d: Full_verify: number of keys out of sort: %d\n",
                MYTHREAD, j );
    else
        (*passed_verification)++;
}

typedef int elem_t;

shared elem_t *shared ptrs[THREADS];

void upc_reduced_sum( shared elem_t *buffer, size_t size )
{
    int     thr_cnt;
    int     p;
    elem_t *local_array, *lp;

    ptrs[MYTHREAD] = (shared elem_t *)buffer;
    local_array    = (elem_t *)malloc( size * sizeof(elem_t) );
    lp             = (elem_t *)buffer;

    upc_barrier;

    /* reduce: thread 0 fetches every other thread's buffer and adds it
       into its own copy, so only thread 0 ends up with the complete sum.
       (The affinity expression and the epilogue are assumed.) */
    upc_forall( thr_cnt=1; thr_cnt<THREADS; thr_cnt++; 0 )
    {
        upc_memget( local_array, ptrs[thr_cnt], size * sizeof(elem_t) );
        for( p=0; p<size; p++ )
            lp[p] += local_array[p];
    }

    upc_barrier;

    free( local_array );
}

void rank( int iteration )
{
    INT_TYPE i, j;
    INT_TYPE key;
    INT_TYPE shift = MAX_KEY_LOG_2 - NUM_BUCKETS_LOG_2; /* as in the reference NPB IS code */
    INT_TYPE bucket_sum_accumulator, local_bucket_sum_accumulator;
    INT_TYPE total_displ;

    TIMER_START( TIMER_COMPTIME );

    /* Iteration alteration of keys.  (Assumption: thread 0 perturbs two
       keys per iteration, as in the reference NPB code.) */
    if( MYTHREAD == 0 )
    {
        key_array[iteration] = iteration;
        key_array[iteration+MAX_ITERATIONS] = MAX_KEY - iteration;
    }

    /* Initialize the bucket counters and the redistribution pointers */
    for( i=0; i<NUM_BUCKETS+TEST_ARRAY_SIZE; i++ )
    {
        bucket_size[i] = 0;
        bucket_size_totals[i] = 0;
        process_bucket_distrib_ptr1[i] = 0;
        process_bucket_distrib_ptr2[i] = 0;
    }

    /* Determine where the partial verify test keys are, and load them into
       the top of array bucket_size so they travel with the reduction.
       (Assumption: the ownership test follows the reference MPI code.) */
    for( i=0; i<TEST_ARRAY_SIZE; i++ )
        if( (test_index_array[i]/NUM_KEYS) == MYTHREAD )
            bucket_size[NUM_BUCKETS+i] =
                key_array[test_index_array[i] % NUM_KEYS];

    /* Determine the number of keys in each bucket */
    for( i=0; i<NUM_KEYS; i++ )
        bucket_size[key_array[i] >> shift]++;

    /* Accumulative bucket sizes are the bucket pointers */
    bucket_ptrs[0] = 0;
    for( i=1; i< NUM_BUCKETS; i++ )
        bucket_ptrs[i] = bucket_ptrs[i-1] + bucket_size[i-1];

    /* Sort into appropriate bucket */
    for( i=0; i<NUM_KEYS; i++ )
    {
        key = key_array[i];
        key_buff1[bucket_ptrs[key >> shift]++] = key;
    }

    memcpy( bucket_size_totals, bucket_size,
            (NUM_BUCKETS+TEST_ARRAY_SIZE) * sizeof(INT_TYPE) );

    TIMER_STOP( TIMER_COMPTIME );
    TIMER_START( TIMER_COMMTIME );
    TIMER_START( TIMER_ALLREDUCE );
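
    /*
       Illustrative note on the layout used by the reduction that follows
       (added commentary; all identifiers are from the code above):
       bucket_size_totals_shd is declared with the default block size of 1,
       so element e has affinity to thread e % THREADS.  For example, with
       THREADS = 4, thread 2 owns elements 2, 6, 10, ..., and
       &bucket_size_totals_shd[2] is the first of them.  Casting such a
       pointer to a private (elem_t *), as upc_reduced_sum() does, walks
       this thread's NUM_BUCKETS+TEST_ARRAY_SIZE counters as one contiguous
       local array, which is also what the upc_memget() calls rely on.
    */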

    /* Get the bucket size totals for the entire problem.  These will be
       used to determine the redistribution of keys. */
    upc_reduced_sum( &bucket_size_totals_shd[MYTHREAD],
                     NUM_BUCKETS+TEST_ARRAY_SIZE );

    /* All other threads copy the result locally: with upc_reduced_sum
       only thread 0 holds the final result. */
    if( MYTHREAD != 0 )
        upc_memget( bucket_size_totals, bucket_size_totals_shd,
                    (NUM_BUCKETS+TEST_ARRAY_SIZE) * sizeof(INT_TYPE) );

    TIMER_STOP( TIMER_ALLREDUCE );
    TIMER_START( TIMER_ALLTOALL );

    /* Determine the redistribution of keys: accumulate the bucket size
       totals until this number surpasses NUM_KEYS (which is the average
       number of keys per processor).  Then all keys in these buckets go
       to processor 0.  Continue accumulating again until surpassing
       2*NUM_KEYS; all keys in these buckets go to processor 1, etc.
       This algorithm guarantees that all processors have work in the
       ranking; no processors are left idle.  The optimum number of
       buckets, however, does not result in as high a degree of load
       balancing (as even a distribution of keys as is possible) as is
       obtained from increasing the number of buckets, but more buckets
       result in more computation per processor, so the optimum number of
       buckets turns out to be 1024 for the machines tested.  Note that
       process_bucket_distrib_ptr1 and ..._ptr2 hold the numbers of the
       first and last bucket which each processor will have after the
       redistribution is done. */
    bucket_sum_accumulator = 0;
    local_bucket_sum_accumulator = 0;
    send_infos[0].displ = 0;
    process_bucket_distrib_ptr1[0] = 0;

    for( i = 0, j = 0; i < NUM_BUCKETS; i++ )
    {
        bucket_sum_accumulator       += bucket_size_totals[i];
        local_bucket_sum_accumulator += bucket_size[i];

        if( bucket_sum_accumulator >= (j+1)*NUM_KEYS )
        {
            send_infos[j].count = local_bucket_sum_accumulator;
            if( j != 0 )
            {
                send_infos[j].displ = send_infos[j-1].displ +
                                      send_infos[j-1].count;
                process_bucket_distrib_ptr1[j] =
                    process_bucket_distrib_ptr2[j-1] + 1;
            }
            process_bucket_distrib_ptr2[j++] = i;
            local_bucket_sum_accumulator = 0;
        }
    }

    /* Equivalent to the MPI_Alltoall + MPI_Alltoallv calls in the C + MPI
       version of the NAS Parallel Benchmarks: each thread copies its keys
       locally from the other threads (key redistribution). */
    total_displ = 0;

    upc_barrier;

    for( i=0; i