#ifndef __UPCR_VECT_C__ #define __UPCR_VECT_C__ #define UPCRT_DEBUG_THREAD -1 /* UPCRT_MODEL_PIPE is 1 unless UPCRT_NO_PIPEVIS is defined, in which case it is 0. */ #ifndef UPCRT_NO_PIPEVIS #define UPCRT_MODEL_PIPE 1 #else #define UPCRT_MODEL_PIPE 0 #endif /*The cur_* variables are valid only during the */ /*construction phase. They are used to cache the address*/ /*of the current insertion point. */ //upcrt_LoopNestVec UPCR_TLD_DEFINE_TENTATIVE(upcrt_program, sizeof(upcrt_LoopNestVec), 8); //upcrt_LoopNestVec upcrt_program; //upcrt_LoopNest *upcrt_cur_nest; //upcrt_RefDesc *upcrt_cur_ref; //upcrt_Lmad *upcrt_cur_lmad; //upcrt_DimVec *upcrt_cur_dim; //upcrt_ContigTrans upcrt_1RS1; //upcrt_FstrideTrans upcrt_1RSN; int upcrt_print_targets = 0; int upcrt_all2all = 0; /* bupc_tick_t start_anal, end_anal = 0; */ /* bupc_tick_t start_comm, end_comm = 0; */ /* bupc_tick_t start_desc, end_desc = 0; */ /* int total_calls = 0; */ /* Protocol identifiers; upcrt_Coalesce_is_Slow stores PIPE or VIS through its proto argument. */ #define UPCRT_S2P_PROTO_PIPE 1 #define UPCRT_S2P_PROTO_BLOCK 2 #define UPCRT_S2P_PROTO_VIS 3 #define UPCRT_S2P_PROTO_UNDEF 0 /* Log2: counts i upward from 1 while (V >> i) > 0 and returns i-1. For V >= 1 this is floor(log2(V)); V == 0 returns 0. For negative V the result depends on the implementation-defined behaviour of >> on negative ints. */ UPCRV_INLINE(Log2) int Log2(int V) { int i=1; while(V>>i > 0) i++; return i-1; } /* V_to_idx: returns the smallest k >= 0 for which (V >> k) <= 16 (the bound is spelled 2*8 in the loop). The post-increment in the loop condition leaves i one past that k, hence the i-1. */ UPCRV_INLINE(V_to_idx) int V_to_idx(int V) { int i=0; while( V>>i++ > 2*8); return i-1; } #define UPCRT_S2P_SAMPLE_V 10 #define UPCRT_S2P_SAMPLE_D 10 //return < 1 - use strided interface /* NOTE(review): in this copy the body of upcrt_s2p_speed breaks off after "if(V > 1<" and the text resumes at the tail of a later comment ("(dbls, strips) for vec_add"); the intervening source appears to be missing and must be restored from a pristine copy of the file - TODO confirm. */ double upcrt_s2p_speed( double profile[UPCRT_S2P_SAMPLE_V][UPCRT_S2P_SAMPLE_D], int NMSG, int V) { if(V > 1< 1< (dbls, strips) for vec_add */ /* NOTE(review): both tables below list 65535 between 32768 and 131072; every other first column entry is a power of two, so 65536 may have been intended - confirm before relying on exact matches. */ int fast_eval_smallP[][2] = { {2048, 2}, {4096, 4}, {8192,4}, {16384,4}, {32768,4}, {65535,16}, {131072,32}, {262144,32}, {524288,32}, {1048576,64}, /* the large sizes are bogus, need to apply the model and get the values */ {2097152, 128}, {4194304, 128}, {8388608, 256} }; int fast_eval_largeP[][2] = { {2048, 2}, {4096, 2}, {8192,4}, {16384,2}, {32768,4}, {65535,4}, {131072,4}, {262144,8}, {524288,8}, {1048576,8}, /* the large sizes are bogus, need to apply the model and get the values */ {2097152, 16}, {4194304, 32}, {8388608, 64} }; #define UPCRT_FIRST_LOG_DEC 11 /* is log2(fast_eval_smallP[0][0]) */ 
#define UPCRT_MAX_BURST_LEN 16 #define MIN_STRIP_SIZE 128 #define MAX_VOL_OUTSTANDING 100000000000 #define UPCRT_BW_PTHRESH 32 #define UPCRT_NREF_COMM 3 #define UPCRT_NOP_COMM 10 /* contour for VIS */ /* line1: Depth = 8, Vol < 64 line2: Depth - 8 = [(128-8)/(128-64)]*(Vol-64) -> Depth = 1.875*Vol - 116 (Vol > 64) Depth = S1*Vol+C1 NO VIS: (DV0 && Line(V) > D) */ /* For comp and heavy comp I'll need to extract them from the experimental data */ int V0 = 64; /* 64 */ int D0 = 8; /* 8 */ double S1 = 1.875; /* 1.875 */ double C1 = -116; /* -116 */ #endif /* JACQUARD */ #ifdef HIVE /* S [words] Latency Gap Overhead 2 7.4 1.0345 2.133447266 4 7.4 0.50225 2.133447266 8 7.4 0.254 2.133447266 16 7.4 0.15625 2.133447266 32 7.4 0.088125 2.133447266 64 7.4 0.0608125 2.133447266 128 7.4 0.045296875 2.133447266 256 7.4 0.036992188 2.317211914 512 7.4 0.033109375 2.606347656 1024 7.4 0.031145508 2.639257813 2048 7.4 0.030523926 2.66171875 4096 7.4 0.030122559 2.692578125 8192 7.4 0.030053833 2.65234375 16384 7.4 0.030411621 2.6109375 32768 7.4 0.030482819 2.771875 65536 7.4 0.030455566 2.75625 131072 7.4 0.033152969 2.8625 262144 7.4 0.030271671 2.85 524288 7.4 0.030271671 3.05 1048576 7.4 0.030271671 3.7 GAP is JUNK. 
*/ double L = 2.4; /* G 1-to-1 */ double G[] = { 0.31, /* 2 */ 0.152, /* 4 */ 0.076125, /* 8 */ 0.0386875, /* 16 */ 0.0218125, /* 32 */ 0.01515625, /* 64 */ 0.011765625, /* 128 */ 0.009953125, /* 256 */ 0.009443359, /* 512 */ 0.009319336, /* 1024 */ 0.00927002, /* 2048 */ 0.009251953, /* 4096 */ 0.009200141, /* 8192 */ /* 009244141 */ 0.009240417, /* 16K */ 0.009238708, /* 32K */ 0.009237747, 0.009237381, 0.009237103, 0.009237103, 0.009237103 }; double o[20][8] = { {0.466955566, 0.424853516, 0.334637451, 0.355737305, 0.425683594, 0.527966309, 0.583026123, 0.609063721}, /* 2 */ {0.466955566, 0.424853516, 0.334637451, 0.355737305, 0.425683594, 0.527966309, 0.583026123, 0.609063721}, /* 4 */ {0.466955566, 0.424853516, 0.334637451, 0.355737305, 0.425683594, 0.527966309, 0.583026123, 0.609063721}, /* 8 */ {0.466955566, 0.424853516, 0.334637451, 0.355737305, 0.425683594, 0.527966309, 0.583026123, 0.609063721}, /* 16 */ {0.466955566, 0.424853516, 0.334637451, 0.355737305, 0.425683594, 0.527966309, 0.583026123, 0.609063721}, /* 32 */ {0.466955566, 0.424853516, 0.334637451, 0.355737305, 0.425683594, 0.527966309, 0.583026123, 0.609063721}, /* 64 */ {0.474816895, 0.424633789, 0.339208984, 0.380249023, 0.522937012, 0.72791748, 0.826696777, 0.875439453}, /* 128 */ {0.477001953, 0.431787109, 0.349438477, 0.393359375, 0.609350586, 0.831420898, 0.950927734, 1.00246582}, /* 256 */ {0.481689453, 0.439697266, 0.359277344, 0.428759766, 0.680078125, 0.969580078, 1.094677734, 1.163671875}, /* 512 */ {0.500195313, 0.4609375, 0.374121094, 0.468164063, 0.866308594, 1.065039063, 1.184960938, 1.243164063}, /* 1024 */ {0.530664063, 0.487890625, 0.432617188, 0.48984375, 0.849023438, 1.083007813, 1.21875, 1.2703125}, /* 2048 */ {0.63375, 0.63515625, 0.46015625, 0.505859375, 0.9171875, 1.140234375, 1.248046875, 1.28046875}, /* 4096 */ {0.8 , 0.825, 0.509375, 0.55234375, 0.90453125, 1.24140625, 1.28359375, 1.2578125}, /* 8192 */ {1.43125, 0.9890625, 0.56875, 0.7375, 1.1109375, 1.33125, 1.228125, 
1.228125}, /* 16K */ {1.490625, 1.078125, 0.853125, 0.890625, 1.20625, 1.290625, 1.3, 1.3}, {1.825, 1.6875, 1.03125, 1.01875, 1.08125, 1.2, 1.3, 1.3}, {3.775, 2.2625, 0.975, 1.0375, 1.1, 1.2, 1.3, 1.3}, {4.6, 2.025, 1.7, 1.8, 1.9, 2.0, 2.0, 2.0}, {3.6, 3.7, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5}, {6.7, 6.7, 6.7, 6.7, 6.7, 6.7, 6.7, 6.7} }; /* double o[] = { 0.334637451, 0.334637451, 0.334637451, 0.334637451, 0.334637451, 0.334637451, 0.339208984, 0.349438477, 0.359277344, 0.374121094, 0.432617188, 0.46015625, 0.509375, 0.56875, 0.853125, 1.03125, 0.975, 1.7 }; */ #define UPCRT_MAX_BURST_LEN 16 #define MIN_STRIP_SIZE 64 /* for Elan there is another restriction */ /* the total volume of outstanding communication should be under 4 MB */ #define MAX_VOL_OUTSTANDING 4194304 /* bytes */ /* fast_eval -> (dbls, strips) for vec_add */ int fast_eval_smallP[][2] = { {256,2}, {512, 2}, {1024, 4}, {2048, 8}, {4096,8}, {8192, 16}, {16384,32}, {32768,32}, {65535,64}, {131072,128}, {262144,128}, {524288,256}, {1048576,512}, /* the large sizes are bogus, need to apply the model and get the values */ {2097152, 512}, {4194304, 1024}, {8388608, 2048} }; /* for hive this is meaningless since I don't have data from a large system */ int fast_eval_largeP[][2] = { {256,2}, {512, 2}, {1024, 2}, {2048, 2}, {4096, 2}, {8192,4}, {16384,4}, {32768,4}, {65535,8}, {131072,16}, {262144,16}, {524288,32}, {1048576,32}, /* the large sizes are bogus, need to apply the model and get the values */ {2097152, 64}, {4194304, 64}, {8388608, 128} }; #define UPCRT_FIRST_LOG_DEC 8 /* is log2(fast_eval_smallP[0][0]) */ #define UPCRT_BW_PTHRESH 32 #define UPCRT_NREF_COMM 3 #define UPCRT_NOP_COMM 10 /* contour for VIS */ /* line1: Depth = 16, Vol < 16 line2: Depth = Vol+16 Depth = S1*Vol+C1 NO VIS: (DV0 && Line(V) > D) */ /* These values are for redistributions only */ /* For comp and heavy comp I'll need to extract them from the experimental data */ int V0 = 16; int D0 = 16; double S1 = 1; double C1 = 16; #endif /* HIVE 
*/ #if !defined(JACQUARD) && !defined(HIVE) && !defined(BIGBEN) #define UPCRT_BW_PTHRESH 128 #define UPCRT_NREF_COMM 2 #define UPCRT_NOP_COMM 10 #define UPCRT_FIRST_LOG_DEC 8 #define UPCRT_MAX_BURST_LEN 1 int **fast_eval_smallP; int **fast_eval_largeP; int V0, D0; double S1, C1; #endif #ifdef BIGBEN /* fast_eval -> (dbls, strips) for vec_add */ int fast_eval_smallP[][2] = { {2048, 2}, {4096,2}, {8192, 4}, {16384,8}, {32768,8}, {65535,16}, {131072,32}, {262144,16}, {524288,32}, {1048576,64}, /* the large sizes are bogus, need to apply the model and get the values */ {2097152, 64}, {4194304, 128}, {8388608, 128} }; /* these values are bogus but they work */ int fast_eval_largeP[][2] = { {2048, 2}, {4096,2}, {8192, 2}, {16384,2}, {32768,2}, {65535,2}, {131072,3}, {262144,4}, {524288,4}, {1048576,4}, /* the large sizes are bogus, need to apply the model and get the values */ {2097152, 8}, {4194304, 8}, {8388608, 8} }; double L = 10.14; double G[] = {4.671, 2.331, 1.20925, 0.604125, 0.3014375, 0.1515625, 0.0756875, 0.03778125, 0.018914063, 0.00990625, 0.007710938, 0.007289063, 0.007085938, 0.006890625, 0.006976563, 0.006890625, 0.006953125, 0.006882813, 0.00690625, 0.006882813, 0.006890625 }; /* Min of avg-o burst 1 2 4 8 16 32 64 size 1 2.178809408 3.135407308 2.834609782 3.820977783 3.800411377 3.896157046 3.959465027 2 2.152951882 3.035756614 2.819404001 3.811727998 3.801057943 3.870127586 3.941960653 4 2.009599813 2.945654975 2.819082981 3.849566465 4.322325846 4.561146828 4.683028666 8 1.995893742 2.941042856 2.756566705 3.84580559 4.34165446 4.56472394 4.689964294 16 1.989331345 3.026200045 2.647053019 3.818221399 4.340697428 4.560356988 4.693093872 32 1.907158261 2.982582131 2.455847846 3.820703495 4.360227458 4.586076751 4.708575439 64 1.821785191 3.095770107 2.255894979 3.826625755 4.382796631 4.607444481 4.731240336 128 2.132766869 2.435070801 2.162069024 3.866184304 4.365262044 4.619573749 4.737724813 256 2.190986829 2.248913574 2.177405895 3.985110677 
4.419092701 4.646263631 4.749895368 512 2.009079319 2.01328125 2.065913086 3.170167824 3.899412028 4.198239862 4.457199436 1024 1.971617543 1.897246094 2.043894676 2.620747884 2.759830729 2.910899523 3.126907552 2048 1.838626302 2.021853299 2.061319987 2.296577381 2.347526042 2.416471354 2.626595052 4096 1.895138889 1.966080729 2.115959821 2.259960938 2.203645833 2.383821615 2.519053819 8192 2.252473958 2.109747024 2.180512153 2.315208333 2.403645833 2.454861111 2.424479167 16384 2.297544643 2.222569444 2.330416667 2.667838542 2.630381944 2.439583333 2.383333333 32768 2.4 2.416666667 3.06328125 3.085763889 2.69375 2.538541667 65536 2.952916667 4.00625 4.084027778 3.251041667 2.941666667 131072 6.027083333 5.493055556 4.59375 3.820833333 262144 9.522222222 6.741666667 5.666666667 524288 10 8.183333333 1048576 12.43333333 */ double o[20][8] = { { 2.178809408, 3.135407308, 2.834609782, 3.820977783, 3.800411377, 3.896157046, 3.959465027, 4}, { 2.152951882, 3.035756614, 2.819404001, 3.811727998, 3.801057943, 3.870127586, 3.941960653, 4}, { 2.009599813, 2.945654975, 2.819082981, 3.849566465, 4.322325846, 4.561146828, 4.683028666, 4.9}, { 1.995893742, 2.941042856, 2.756566705, 3.84580559, 4.34165446, 4.56472394, 4.689964294, 4.9}, { 1.989331345, 3.026200045, 2.647053019, 3.818221399, 4.340697428, 4.560356988, 4.693093872, 4.9}, { 1.907158261, 2.982582131, 2.455847846, 3.820703495, 4.360227458, 4.586076751, 4.708575439, 4.9}, { 1.821785191, 3.095770107, 2.255894979, 3.826625755, 4.382796631, 4.607444481, 4.731240336, 5}, { 2.132766869, 2.435070801, 2.162069024, 3.866184304, 4.365262044, 4.619573749, 4.737724813, 5}, { 2.190986829, 2.248913574, 2.177405895, 3.985110677, 4.419092701, 4.646263631, 4.749895368, 5}, { 2.009079319, 2.01328125, 2.065913086, 3.170167824, 3.899412028, 4.198239862, 4.457199436, 4.8}, { 1.971617543, 1.897246094, 2.043894676, 2.620747884, 2.759830729, 2.910899523, 3.126907552, 3.3}, { 1.838626302, 2.021853299, 2.061319987, 2.296577381, 2.347526042, 
2.416471354, 2.626595052, 2.9}, { 1.895138889, 1.966080729, 2.115959821, 2.259960938, 2.203645833, 2.383821615, 2.519053819, 2.88}, { 2.252473958, 2.109747024, 2.180512153, 2.315208333, 2.403645833, 2.454861111, 2.424479167, 2.8}, { 2.297544643, 2.222569444, 2.330416667, 2.667838542, 2.630381944, 2.439583333, 2.383333333, 2.8}, { 2.4, 2.416666667, 3.06328125, 3.085763889, 2.69375, 2.538541667, 2.53, 2.54}, { 2.952916667, 4.00625, 4.084027778, 3.251041667, 2.941666667, 2.8, 2.8, 2.8}, { 6.027083333, 5.493055556, 4.59375, 3.820833333, 3.6, 3.5, 3.4, 3.4}, { 9.522222222, 6.741666667, 5.666666667, 5.6, 5.6, 5.6, 5.6, 5.6}, { 10, 8.183333333, 7.1, 7.1, 7.1, 7.1, 7.1, 7.1}, { 12.43333333, 10, 10, 10, 10, 10, 10, 10} }; #define UPCRT_FIRST_LOG_DEC 11 /* is log2(fast_eval_smallP[0][0]) */ #define UPCRT_MAX_BURST_LEN 64 #define MIN_STRIP_SIZE 2048 #define MAX_VOL_OUTSTANDING 100000000000 #define UPCRT_BW_PTHRESH 4 #define UPCRT_NREF_COMM 3 #define UPCRT_NOP_COMM 10 /* contour for VIS */ /* line1: Depth = 8, Vol < 64 line2: Depth - 8 = [(128-8)/(128-64)]*(Vol-64) -> Depth = 1.875*Vol - 116 (Vol > 64) Depth = S1*Vol+C1 NO VIS: (DV0 && Line(V) > D) */ /* For comp and heavy comp I'll need to extract them from the experimental data */ /* This is junk copied from Jacquard*/ int V0 = 16; /* 64 */ int D0 = 4; /* 8 */ double S1 = 1.875; /* 1.875 */ double C1 = -116; /* -116 */ #endif /* BIGBEN */ #ifdef BASSI //2T -INTER // 2 4 8 16 32 64 128 256 512 1024 double upcrt_s2p_inter[UPCRT_S2P_SAMPLE_V][UPCRT_S2P_SAMPLE_D] = { {2.111111111, 1.193548387, 0.6, 0.305084746, 0.151260504, 0.08315565, 0.047418335, 0.034138655, 0.021505376, 0.015270164}, {2.058824, 1.062500, 0.606557, 0.295082, 0.161157, 0.089212, 0.061651, 0.034014, 0.022234, 0.020498 }, {1.941176, 1.062500, 0.580645, 0.327869, 0.157025, 0.108787, 0.061013, 0.038243, 0.032474, 0.021197 }, {2.000000, 1.062500, 0.580645, 0.314050, 0.218107, 0.113636, 0.065421, 0.053175, 0.032241, 0.018992 }, {2.176471, 1.062500, 0.555556, 
0.422764, 0.214876, 0.126819, 0.101852, 0.056046, 0.036477, 0.023613 }, {2.166667, 1.121212, 0.833333, 0.411290, 0.279352, 0.219713, 0.116136, 0.077042, 0.060963, 0.041328 }, {2.315789, 3.285714, 0.895522, 0.692308, 0.413127, 0.239300, 0.172683, 0.148563, 0.109652, 0.062009 }, {17.800000, 4.769231, 1.038462, 0.680000, 0.435216, 0.309365, 0.267521, 0.261955, 0.143311, 0.088910 }, {17.560000, 11.645833, 1.150538, 0.770492, 0.565934, 0.502086, 0.489244, 0.510690, 0.144186, 0.129775 }, {22.866667, 24.754386, 1.285714, 0.986425, 0.843537, 0.804077, 0.839022, 0.841731, 0.203549, 0.192836 } }; //2T - INTRA double upcrt_s2p_intra[UPCRT_S2P_SAMPLE_V][UPCRT_S2P_SAMPLE_D] = { {2.700000, 1.050000, 0.519481, 0.245161, 0.140065, 0.072250, 0.040197, 0.030852, 0.021373, 0.013480 }, {1.904762, 1.128205, 0.532468, 0.248366, 0.138614, 0.071895, 0.067379, 0.056013, 0.022254, 0.015022 }, {2.050000, 0.850000, 0.506494, 0.246753, 0.150327, 0.181077, 0.106296, 0.034454, 0.022869, 0.016048 }, {2.047619, 1.000000, 0.641026, 0.346154, 0.359223, 0.218597, 0.063725, 0.038635, 0.022657, 0.015416 }, {2.350000, 1.230769, 0.610390, 0.725490, 0.512903, 0.252874, 0.075720, 0.042045, 0.028372, 0.022029 }, {2.045455, 1.214286, 1.325000, 0.929487, 0.787097, 0.459677, 0.084484, 0.055488, 0.042759, 0.036427 }, {2.347826, 2.422222, 1.901235, 1.500000, 1.381538, 0.717791, 0.102446, 0.078065, 0.065658, 0.065359 }, {5.521739, 3.909091, 3.111111, 3.200000, 2.477876, 1.123348, 0.149852, 0.123303, 0.109914, 0.110074 }, {6.034483, 4.660714, 4.345455, 4.304147, 3.696970, 1.674065, 0.193188, 0.173275, 0.168184, 0.166030 }, {8.029412, 7.333333, 7.084615, 6.916031, 5.970817, 2.687439, 0.280625, 0.268628, 0.267623, 0.258752 } }; //8T INTRA double upcrt_s2p_node_intra[UPCRT_S2P_SAMPLE_V][UPCRT_S2P_SAMPLE_D] = { {2.487179, 1.357616, 0.673077, 1.132353, 0.530579, 0.066694, 0.120747, 0.064599, 0.032524, 0.016280 }, {2.378378, 3.132450, 1.314935, 0.833333, 32.369601, 0.101618, 7.800622, 1.036595, 0.544827, 1.833203 }, 
{361.758621, 283.313333, 34.883721, 12.983766, 6.191419, 12.241394, 7.156814, 2.077578, 1.688899, 1.320548 }, {112.933333, 158.940397, 40.377419, 19.702970, 29.628289, 7.655785, 7.578313, 4.964988, 0.989998, 0.338954 }, {306.466667, 65.822368, 68.381107, 38.504092, 16.965261, 11.875776, 12.556870, 1.924774, 1.743105, 0.528097 }, {261.097561, 85.473333, 75.504983, 54.751634, 19.706173, 15.107202, 6.963417, 2.186732, 1.584494, 1.409243 }, {89.192308, 68.196203, 107.155844, 45.459504, 23.581182, 8.965674, 6.324762, 2.655144, 1.719271, 0.522914 }, {110.160494, 91.359477, 76.090615, 59.541599, 17.904801, 10.523731, 4.044587, 2.970414, 1.749480, 0.618387 }, {99.892473, 60.308901, 40.257373, 12.227642, 13.319891, 5.849352, 1.875917, 1.627133, 1.261251, 0.655449 }, {148.905660, 68.189815, 34.584726, 13.798561, 12.024625, 4.677818, 2.559940, 1.808686, 1.863794, 0.489635 } }; //16T 8PPN double upcrt_s2p_node_inter[UPCRT_S2P_SAMPLE_V][UPCRT_S2P_SAMPLE_D] = { {614.557692, 35.690000, 14.103448, 3.606357, 1.883951, 0.976471, 0.650501, 1.193660, 0.439433, 0.221960 }, {22.865385, 9.400000, 5.691542, 3.202934, 4.362843, 0.401246, 1.318917, 0.707214, 0.460347, 0.187200 }, {2.679245, 1.372549, 1.465686, 8.544335, 2.491379, 1.784606, 0.403870, 0.200189, 0.023336, 0.621704 }, {16.962963, 0.942857, 1.626168, 0.868687, 0.834379, 9.647748, 0.268905, 0.149977, 0.077963, 1.553863 }, {12.228070, 9.371429, 4.224880, 4.487374, 2.318352, 1.504364, 11.131629, 0.642925, 0.191007, 0.626404 }, {58.236364, 46.699029, 24.622642, 17.031401, 35.795483, 1.416352, 0.691080, 0.193228, 0.120607, 0.288935 }, {6.545455, 129.616667, 14.256522, 11.798507, 31.040865, 6.872750, 8.751166, 1.560346, 0.559889, 1.366054 }, {376.830769, 377.425926, 163.428571, 183.261728, 13.362283, 6.257161, 3.086916, 4.681761, 2.407125, 0.884232 }, {515.761905, 124.461539, 131.100000, 100.659292, 22.270601, 29.806685, 10.935340, 1.690995, 0.712067, 1.426867 }, {249.800000, 67.140741, 119.845850, 62.962302, 16.291089, 4.749125, 
6.224777, 3.867436, 2.284247, 1.083169 } }; // 8T - 4PPN double upcrt_s2p_node_inter_4ppn[UPCRT_S2P_SAMPLE_V][UPCRT_S2P_SAMPLE_D] = { {2.857143, 0.896552, 0.601770, 0.247706, 0.143182, 0.073818, 0.043354, 0.040461, 0.023925, 0.016833 }, {2.300000, 1.160714, 0.642857, 0.278539, 0.140878, 0.084785, 0.072971, 0.041860, 0.029907, 0.020842 }, {2.206897, 1.228070, 0.621622, 0.298643, 0.150342, 0.175991, 0.075679, 0.043950, 0.028218, 0.025906 }, {2.387097, 1.125000, 0.630631, 0.328704, 0.335616, 0.156794, 0.098542, 0.054619, 0.029870, 0.020426 }, {2.258065, 1.105263, 0.612613, 0.742081, 0.365517, 0.228972, 0.151620, 0.077551, 0.032366, 0.022242 }, {2.517241, 1.438596, 2.774775, 0.941964, 0.862700, 0.361530, 0.261220, 0.078278, 0.056432, 0.040787 }, {2.967742, 5.724138, 2.345133, 2.866667, 1.498866, 0.590755, 0.371363, 0.136441, 0.086087, 0.068531 }, {11.548387, 10.786885, 3.915966, 3.056034, 0.748908, 0.761178, 0.628872, 0.242794, 0.161775, 0.095881 }, {18.729730, 14.371429, 6.536232, 2.367647, 1.576208, 1.537313, 0.584038, 0.555398, 0.334466, 0.136143 }, {38.333333, 22.494253, 10.102410, 4.182927, 2.817054, 3.517703, 2.445596, 0.976209, 0.371006, 0.371833 } }; /* upcrt_Coalesce_is_Slow: as written, always returns 1 through the early return below and never writes through proto. The code after that return is unreachable; it would call upcrt_s2p_speed(table, Depth, Vol) with upcrt_s2p_node_intra when Thread_Distance <= BUPC_THREADS_NEAR and with upcrt_s2p_inter otherwise, and for a result <= 1.0 store UPCRT_S2P_PROTO_VIS in *proto and return 0, else store UPCRT_S2P_PROTO_PIPE and return 1. The local profile is declared but never used. */ UPCRV_INLINE(upcrt_Coalesce_is_Slow) int upcrt_Coalesce_is_Slow(int Vol, int Depth, int Thread_Distance, int *proto) { double **profile; /* unconditional early exit: everything below in this function is dead code and *proto is left untouched */ return 1; /* return *proto = UPCRT_S2P_PROTO_VIS, 0; */ /* need to figure out what error am I allowing, right now is 10% */ if(Thread_Distance <= BUPC_THREADS_NEAR) { return (upcrt_s2p_speed(upcrt_s2p_node_intra, Depth, Vol) <= 1.0) ? ( *proto = UPCRT_S2P_PROTO_VIS, 0) : (*proto = UPCRT_S2P_PROTO_PIPE, 1); } else { return (upcrt_s2p_speed(upcrt_s2p_inter, Depth, Vol) <= 1.0) ? 
(*proto = UPCRT_S2P_PROTO_VIS, 0) : (*proto = UPCRT_S2P_PROTO_PIPE, 1); } } /* end of upcrt_Coalesce_is_Slow */ #endif /* BASSI */ #ifdef RANGER #ifdef UPCRT_S2P_SAMPLE_V #undef UPCRT_S2P_SAMPLE_V #endif #define UPCRT_S2P_SAMPLE_V 11 //2T -INTER // 2 4 8 16 32 64 128 256 512 1024 double upcrt_s2p_inter_vispipe[UPCRT_S2P_SAMPLE_V][UPCRT_S2P_SAMPLE_D] = { {2.777778, 0.875000, 0.678571, 0.465116, 0.328358, 0.247788, 0.149194, 0.093690, 0.069638, 0.045517 }, {1.368421, 0.863636, 0.807692, 0.523810, 0.406250, 0.279661, 0.188841, 0.113027, 0.074039, 0.060424 }, {2.153846, 1.500000, 1.000000, 0.613636, 0.500000, 0.350427, 0.221344, 0.144231, 0.104673, 0.086676 }, {2.727273, 2.090909, 1.136364, 0.690476, 0.566667, 0.363636, 0.224409, 0.148936, 0.119777, 0.096878 }, {2.500000, 1.733333, 1.208333, 0.785714, 0.646154, 0.459459, 0.276265, 0.234951, 0.193878, 0.165304 }, {3.916667, 1.750000, 1.185185, 0.847826, 0.765625, 0.629630, 0.477178, 0.375954, 0.356164, 0.306255 }, {3.733333, 1.882353, 1.600000, 1.195122, 1.063492, 0.950000, 0.703297, 0.581240, 0.635902, 0.613521 }, {5.266667, 2.739130, 3.428571, 2.369565, 1.795455, 1.337748, 1.069565, 0.916312, 1.163005, 1.079855 }, {3.850000, 3.115385, 3.097561, 2.700000, 2.040323, 1.486891, 1.555977, 2.053435, 2.976390, 6.079186 }, {0.882353, 0.717391, 0.776119, 0.762712, 0.850679, 0.763855, 0.727273, 0.717405, 0.698236, 0.688517 }, {0.864865, 0.894737, 0.884615, 0.935961, 0.758782, 0.721489, 0.732314, 0.843233, 0.794386, 0.816266 }, }; double upcrt_s2p_inter_pipeblock[UPCRT_S2P_SAMPLE_V][UPCRT_S2P_SAMPLE_D] = { {0.600000, 0.640000, 0.560000, 0.452632, 0.366120, 0.313889, 0.347826, 0.368569, 0.376705, 0.384888 }, {1.357143, 0.785714, 0.520000, 0.446809, 0.351648, 0.330532, 0.329096, 0.370476, 0.379041, 0.384875 }, {0.928571, 0.560000, 0.470588, 0.453608, 0.347826, 0.323204, 0.353352, 0.364656, 0.375307, 0.380259 }, {0.733333, 0.423077, 0.458333, 0.437500, 0.317460, 0.328804, 0.347945, 0.355816, 0.370996, 0.373841 }, {0.933333, 0.555556, 0.480000, 0.420000, 0.329949, 
0.291339, 0.339498, 0.342420, 0.358974, 0.365356 }, {0.800000, 0.571429, 0.519231, 0.438095, 0.309179, 0.268657, 0.301250, 0.328527, 0.343260, 0.358511 }, {0.833333, 0.531250, 0.403226, 0.341667, 0.265823, 0.255319, 0.292605, 0.324457, 0.319708, 0.342984 }, {0.681818, 0.560976, 0.383562, 0.343284, 0.318841, 0.257240, 0.289673, 0.304273, 0.301110, 0.345527 }, {0.769231, 0.530612, 0.431579, 0.449438, 0.341598, 0.372905, 0.378049, 0.375897, 0.351365, 0.431115 }, {1.096774, 0.836364, 0.587719, 0.531532, 0.496629, 0.460599, 0.457825, 0.443921, 0.449249, 0.452683 }, {0.925000, 0.730769, 0.684211, 0.669967, 0.708126, 0.691286, 0.655662, 0.556718, 0.592762, 0.560070 }, }; double upcrt_s2p_inter_visblock[UPCRT_S2P_SAMPLE_V][UPCRT_S2P_SAMPLE_D] = { {1.666667, 0.560000, 0.380000, 0.210526, 0.120219, 0.077778, 0.051893, 0.034531, 0.026233, 0.017519 }, {1.857143, 0.678571, 0.420000, 0.234043, 0.142857, 0.092437, 0.062147, 0.041874, 0.028064, 0.023256 }, {2.000000, 0.840000, 0.470588, 0.278351, 0.173913, 0.113260, 0.078212, 0.052595, 0.039284, 0.032959 }, {2.000000, 0.884615, 0.520833, 0.302083, 0.179894, 0.119565, 0.078082, 0.052994, 0.044437, 0.036217 }, {2.333333, 0.962963, 0.580000, 0.330000, 0.213198, 0.133858, 0.093791, 0.080452, 0.069597, 0.060395 }, {3.133333, 1.000000, 0.615385, 0.371429, 0.236715, 0.169154, 0.143750, 0.123511, 0.122257, 0.109796 }, {3.111111, 1.000000, 0.645161, 0.408333, 0.282700, 0.242553, 0.205788, 0.188587, 0.203303, 0.210428 }, {3.590909, 1.536585, 1.315068, 0.813433, 0.572464, 0.344123, 0.309824, 0.278809, 0.350192, 0.373119 }, {2.961538, 1.653061, 1.336842, 1.213483, 0.696970, 0.554469, 0.588235, 0.771879, 1.045800, 2.620830 }, {0.967742, 0.600000, 0.456140, 0.405405, 0.422472, 0.351831, 0.332963, 0.318471, 0.313682, 0.311680 }, {0.800000, 0.653846, 0.605263, 0.627063, 0.537313, 0.498755, 0.480150, 0.469443, 0.470882, 0.457167 }, }; //2T - INTRA double upcrt_s2p_intra_pipeblock[UPCRT_S2P_SAMPLE_V][UPCRT_S2P_SAMPLE_D] = { {0.500000, 0.888889, 
1.529412, 1.171429, 1.242857, 1.864286, 1.861210, 1.932624, 1.937057, 1.940186 }, {2.250000, 1.625000, 1.352941, 1.147059, 1.600000, 1.914286, 1.943262, 1.939823, 1.939876, 1.942452 }, {2.500000, 2.625000, 1.529412, 1.400000, 1.746479, 1.950355, 1.926316, 1.928070, 1.922067, 1.920753 }, {1.250000, 1.750000, 1.470588, 1.771429, 1.916667, 1.930556, 1.913495, 1.898451, 1.892704, 1.890270 }, {1.500000, 1.222222, 1.263158, 1.263158, 1.815789, 1.830065, 1.826230, 1.828711, 1.757502, 1.755061 }, {1.800000, 1.300000, 1.190476, 1.214286, 1.547619, 1.702381, 1.690476, 1.680535, 1.693908, 1.681079 }, {1.166667, 0.846154, 0.925926, 1.132075, 1.238532, 1.422018, 1.444954, 1.439080, 1.485387, 1.463037 }, {0.900000, 0.666667, 0.756098, 0.726190, 0.946429, 0.973134, 1.038806, 1.133730, 1.177712, 1.149202 }, {1.000000, 0.920000, 0.826923, 0.864078, 1.043478, 1.028916, 1.507212, 1.346154, 1.230469, 1.188212 }, {2.000000, 2.032258, 1.333333, 1.370079, 2.435294, 2.852941, 2.319608, 1.708842, 1.506824, 1.485904 }, {1.576923, 1.471698, 1.566038, 1.633803, 2.926887, 3.053927, 2.376820, 1.810169, 1.600331, 1.516116 }, }; double upcrt_s2p_intra_vispipe[UPCRT_S2P_SAMPLE_V][UPCRT_S2P_SAMPLE_D] = { {4.750000, 1.250000, 0.384615, 0.268293, 0.160920, 0.061303, 0.047801, 0.034862, 0.027918, 0.020324 }, {0.888889, 0.769231, 0.434783, 0.307692, 0.133929, 0.078358, 0.051095, 0.046533, 0.030082, 0.025296 }, {2.100000, 0.476190, 0.461538, 0.306122, 0.161290, 0.109091, 0.072860, 0.053685, 0.044191, 0.038979 }, {4.600000, 0.857143, 0.520000, 0.274194, 0.159420, 0.107914, 0.070524, 0.058024, 0.052608, 0.043084 }, {4.833333, 1.181818, 0.666667, 0.416667, 0.195652, 0.128571, 0.104129, 0.096343, 0.089063, 0.080507 }, {5.000000, 1.230769, 0.800000, 0.490196, 0.269231, 0.195804, 0.181338, 0.169761, 0.166228, 0.169048 }, {3.285714, 1.818182, 1.040000, 0.583333, 0.407407, 0.329032, 0.346032, 0.316294, 0.298611, 0.364277 }, {10.444444, 3.071429, 1.225806, 1.032787, 0.710692, 0.693252, 0.652299, 0.577982, 
0.629991, 0.642965 }, {6.833333, 3.956522, 3.581395, 2.247191, 1.754630, 1.569087, 1.075758, 1.152679, 1.211722, 1.192604 }, {2.333333, 1.587302, 1.440476, 1.017241, 0.502415, 0.658419, 0.838546, 1.105489, 1.049969, 1.027534 }, {1.121951, 0.948718, 0.963855, 0.942529, 1.246575, 1.080230, 1.200196, 1.060549, 1.004740, 0.998051 }, }; double upcrt_s2p_intra_visblock[UPCRT_S2P_SAMPLE_V][UPCRT_S2P_SAMPLE_D] = { {2.375000, 1.111111, 0.588235, 0.314286, 0.200000, 0.114286, 0.088968, 0.067376, 0.054078, 0.039433 }, {2.000000, 1.250000, 0.588235, 0.352941, 0.214286, 0.150000, 0.099291, 0.090265, 0.058355, 0.049137 }, {5.250000, 1.250000, 0.705882, 0.428571, 0.281690, 0.212766, 0.140351, 0.103509, 0.084939, 0.074869 }, {5.750000, 1.500000, 0.764706, 0.485714, 0.305556, 0.208333, 0.134948, 0.110155, 0.099571, 0.081440 }, {7.250000, 1.444444, 0.842105, 0.526316, 0.355263, 0.235294, 0.190164, 0.176183, 0.156529, 0.141296 }, {9.000000, 1.600000, 0.952381, 0.595238, 0.416667, 0.333333, 0.306548, 0.285290, 0.281575, 0.284183 }, {3.833333, 1.538462, 0.962963, 0.660377, 0.504587, 0.467890, 0.500000, 0.455172, 0.443553, 0.532951 }, {9.400000, 2.047619, 0.926829, 0.750000, 0.672619, 0.674627, 0.677612, 0.655275, 0.741947, 0.738896 }, {6.833333, 3.640000, 2.961538, 1.941748, 1.830918, 1.614458, 1.621394, 1.551683, 1.490986, 1.417067 }, {4.666667, 3.225806, 1.920635, 1.393701, 1.223529, 1.878431, 1.945098, 1.889106, 1.582118, 1.526816 }, {1.769231, 1.396226, 1.509434, 1.539906, 3.648585, 3.298945, 2.852650, 1.919774, 1.607916, 1.513161 }, }; //INTRA double upcrt_s2p_node_intra_vispipe[UPCRT_S2P_SAMPLE_V][UPCRT_S2P_SAMPLE_D] = { {5.926829, 1.289617, 0.318979, 0.278174, 0.262028, 0.072727, 0.034873, 0.022840, 0.014779, 0.006602 }, {1.764706, 1.216667, 0.541219, 0.383162, 0.222405, 0.168770, 0.079326, 0.028679, 0.012982, 0.008624 }, {2.625000, 1.610294, 1.274436, 0.783699, 0.513640, 0.254594, 0.106459, 0.039552, 0.019592, 0.011571 }, {1.497976, 1.584746, 1.263006, 0.964724, 0.554189, 
0.291446, 0.148679, 0.076259, 0.039866, 0.030037 }, {5.774510, 3.417722, 2.237589, 1.123613, 0.582635, 0.295896, 0.150137, 0.094297, 0.086355, 0.079619 }, {10.880000, 6.333333, 3.315113, 1.581994, 0.773109, 0.371138, 0.210979, 0.184479, 0.197322, 0.118510 }, {2.712305, 2.268058, 1.818640, 1.150564, 0.619611, 0.440435, 0.391197, 0.289694, 0.221481, 0.196277 }, {2.303933, 1.817121, 1.141015, 0.754350, 0.524581, 0.454780, 0.341480, 0.332690, 0.307990, 0.276106 }, {0.923583, 0.681682, 0.852333, 0.850802, 0.415848, 0.321991, 0.349745, 0.385390, 0.369925, 0.444050 }, {1.585714, 2.597974, 9.093784, 1.745153, 1.217312, 1.150272, 1.037016, 0.994175, 0.999359, 0.997012 }, {1.366471, 13.641600, 17.318036, 2.897127, 1.154882, 1.055750, 1.003950, 1.006407, 1.000766, 0.996644 }, }; double upcrt_s2p_node_intra_pipeblock[UPCRT_S2P_SAMPLE_V][UPCRT_S2P_SAMPLE_D] = { {0.418367, 2.772727, 5.915094, 3.768817, 3.763231, 3.951613, 3.867288, 3.564675, 2.926394, 2.848625 }, {3.090909, 1.967213, 2.364407, 2.552632, 2.697778, 3.819277, 3.497989, 3.282407, 2.825532, 2.786614 }, {2.666667, 2.229508, 2.273504, 2.703390, 2.729787, 3.069330, 3.204573, 2.941689, 2.760442, 2.700619 }, {7.057143, 3.630769, 2.724409, 2.488550, 2.676955, 2.732161, 2.822151, 2.643260, 2.636016, 2.545120 }, {2.684211, 2.106667, 2.088889, 2.445736, 2.458801, 2.476895, 2.517305, 2.468958, 2.418052, 2.304718 }, {2.358491, 2.035714, 1.931677, 1.851190, 1.925000, 2.065315, 2.128571, 2.181013, 2.164259, 2.095023 }, {10.685185, 6.107843, 3.658986, 3.128463, 2.983607, 2.896725, 2.817608, 2.933175, 2.968309, 2.668477 }, {10.487500, 6.718954, 4.432143, 3.552727, 3.148637, 3.287500, 3.326781, 3.314942, 3.538184, 3.461493 }, {28.302326, 13.875000, 6.114706, 3.891147, 6.616131, 7.777863, 6.985913, 6.777562, 6.528895, 4.920015 }, {25.049020, 10.855670, 1.918410, 8.980447, 9.891543, 8.783684, 8.630967, 8.470380, 8.739774, 7.305143 }, {26.226721, 1.291322, 1.011879, 4.532145, 9.501747, 9.373319, 9.181411, 9.224255, 9.202587, 9.241033 
}, }; double upcrt_s2p_node_intra_visblock[UPCRT_S2P_SAMPLE_V][UPCRT_S2P_SAMPLE_D] = { {2.479592, 3.575758, 1.886792, 1.048387, 0.986072, 0.287390, 0.134864, 0.081418, 0.043251, 0.018808 }, {5.454545, 2.393443, 1.279661, 0.978070, 0.600000, 0.644578, 0.277480, 0.094136, 0.036682, 0.024033 }, {7.000000, 3.590164, 2.897436, 2.118644, 1.402128, 0.781434, 0.341155, 0.116349, 0.054082, 0.031250 }, {10.571429, 5.753846, 3.440945, 2.400763, 1.483539, 0.796277, 0.419595, 0.201571, 0.105087, 0.076447 }, {15.500000, 7.200000, 4.674074, 2.748062, 1.432584, 0.732902, 0.377942, 0.232816, 0.208810, 0.183498 }, {25.660377, 12.892857, 6.403727, 2.928571, 1.488235, 0.766517, 0.449084, 0.402351, 0.427055, 0.248281 }, {28.981481, 13.852941, 6.654378, 3.599496, 1.848676, 1.275819, 1.102240, 0.849723, 0.657424, 0.523762 }, {24.162500, 12.209150, 5.057143, 2.680000, 1.651715, 1.495089, 1.136029, 1.102847, 1.089725, 0.955739 }, {26.139535, 9.458333, 5.211765, 3.310595, 2.751307, 2.504404, 2.443286, 2.612007, 2.415202, 2.184734 }, {39.720588, 28.202749, 17.445607, 15.672253, 12.041089, 10.103626, 8.950448, 8.421043, 8.734169, 7.283315 }, {35.838057, 17.615702, 17.523758, 13.130200, 10.973394, 9.895886, 9.217680, 9.283355, 9.209637, 9.210020 }, }; //INTER double upcrt_s2p_node_inter_vispipe[UPCRT_S2P_SAMPLE_V][UPCRT_S2P_SAMPLE_D] = { {1.241026, 1.111111, 0.316860, 0.172996, 0.164464, 0.106746, 0.052963, 0.030032, 0.021857, 0.011874 }, {0.756014, 0.478846, 0.502222, 0.302594, 0.164697, 0.121190, 0.078698, 0.051465, 0.025862, 0.015525 }, {0.499033, 0.503571, 0.637184, 0.746686, 0.343958, 0.195185, 0.110967, 0.049728, 0.033289, 0.018207 }, {0.746154, 1.210983, 0.955986, 0.505109, 0.390045, 0.203119, 0.122661, 0.073975, 0.039825, 0.034641 }, {0.912162, 1.083904, 1.067126, 0.873239, 0.461651, 0.235401, 0.108335, 0.116089, 0.105849, 0.098551 }, {1.524116, 0.978916, 0.868163, 0.792254, 0.456807, 0.224456, 0.218491, 0.240294, 0.210390, 0.149483 }, {1.964237, 2.183473, 1.280397, 1.130855, 0.696266, 
0.416519, 0.344736, 0.311534, 0.231476, 0.202443 }, {1.201629, 0.959699, 0.845938, 0.624645, 0.497354, 0.475915, 0.451440, 0.391222, 0.302621, 0.269124 }, {1.256784, 0.770346, 0.457782, 0.418838, 0.379974, 0.360878, 0.369542, 0.359857, 0.360361, 0.569292 }, {1.344987, 1.425705, 1.375572, 1.835755, 1.288958, 1.096287, 1.042563, 1.007778, 1.004130, 0.993154 }, {1.186916, 1.068961, 1.725974, 2.268554, 1.257897, 1.040409, 1.006499, 1.003365, 0.997276, 0.997461 }, }; double upcrt_s2p_node_inter_pipeblock[UPCRT_S2P_SAMPLE_V][UPCRT_S2P_SAMPLE_D] = { {3.095238, 4.378378, 4.273292, 1.469008, 1.128418, 1.103476, 1.361917, 2.953741, 2.141186, 1.903681 }, {2.745283, 3.113772, 1.551724, 1.051515, 1.416318, 1.536265, 1.395379, 1.685000, 1.535934, 2.667961 }, {4.743119, 2.871795, 1.587393, 0.923810, 1.045736, 1.065930, 1.257442, 1.296531, 2.012661, 2.301509 }, {5.200000, 1.911602, 1.490814, 0.948753, 1.209344, 1.286670, 1.528895, 1.594229, 1.703443, 1.627890 }, {8.338028, 5.457944, 2.593750, 1.893333, 1.782552, 1.815010, 1.780195, 1.819772, 1.850268, 1.941825 }, {8.405405, 5.068702, 3.316279, 1.872527, 1.797516, 2.002012, 2.135435, 2.005258, 2.002599, 1.890883 }, {11.725806, 5.852459, 5.233766, 3.049887, 2.782910, 2.554299, 2.515825, 2.552352, 2.586092, 2.584364 }, {21.042857, 9.825503, 6.285211, 4.350970, 3.676393, 3.358974, 3.185545, 3.228651, 3.372052, 3.356427 }, {20.306122, 11.135417, 8.805699, 7.091731, 6.612724, 6.137812, 5.979750, 5.436476, 4.999221, 4.617033 }, {35.881657, 18.687500, 9.436055, 7.367412, 7.751776, 7.637968, 7.088136, 6.940241, 6.441486, 5.176790 }, {20.622905, 14.083051, 8.212397, 4.837091, 6.856994, 7.702694, 7.708375, 7.733360, 7.768490, 7.795033 }, }; double upcrt_s2p_node_inter_visblock[UPCRT_S2P_SAMPLE_V][UPCRT_S2P_SAMPLE_D] = { {3.841270, 4.864865, 1.354037, 0.254132, 0.185584, 0.117791, 0.072131, 0.088707, 0.046800, 0.022604 }, {2.075472, 1.491018, 0.779310, 0.318182, 0.233264, 0.186179, 0.109813, 0.086719, 0.039722, 0.041420 }, {2.366972, 
1.446154, 1.011461, 0.689796, 0.359690, 0.208054, 0.139535, 0.064474, 0.066999, 0.041904 }, {3.880000, 2.314917, 1.425197, 0.479224, 0.471698, 0.261347, 0.187535, 0.117933, 0.067840, 0.056392 }, {7.605634, 5.915888, 2.767857, 1.653333, 0.822917, 0.427255, 0.192857, 0.211256, 0.195849, 0.191369 }, {12.810811, 4.961832, 2.879070, 1.483516, 0.821118, 0.449363, 0.466574, 0.481852, 0.421326, 0.282654 }, {23.032258, 12.778689, 6.701299, 3.448980, 1.937644, 1.063914, 0.867296, 0.795144, 0.598619, 0.523188 }, {25.285714, 9.429530, 5.316901, 2.717813, 1.828470, 1.598585, 1.438081, 1.263120, 1.020453, 0.903295 }, {25.520408, 8.578125, 4.031088, 2.970284, 2.512662, 2.215000, 2.209770, 1.956352, 1.801524, 2.628442 }, {48.260355, 26.642857, 12.979969, 13.524760, 9.991713, 8.373407, 7.389831, 6.994220, 6.468090, 5.141348 }, {24.477654, 15.054237, 14.174380, 10.973203, 8.625391, 8.013950, 7.758468, 7.759382, 7.747330, 7.775237 }, }; // INTER - HALF NODE double upcrt_s2p_halfnode_inter_vispipe[UPCRT_S2P_SAMPLE_V][UPCRT_S2P_SAMPLE_D] = { {13.157895, 0.907895, 0.422222, 0.481894, 0.234195, 0.101148, 0.062274, 0.042035, 0.023038, 0.012836 }, {1.349206, 1.106061, 0.677419, 0.411268, 0.233618, 0.185694, 0.068379, 0.051597, 0.027939, 0.018445 }, {1.557522, 0.978873, 0.622222, 0.494475, 0.193410, 0.154848, 0.095530, 0.042442, 0.037573, 0.026119 }, {1.162162, 1.020134, 0.751295, 0.298153, 0.241573, 0.178597, 0.093965, 0.051031, 0.038173, 0.044191 }, {1.490066, 1.000000, 0.586873, 0.488127, 0.244413, 0.141643, 0.116056, 0.097903, 0.108821, 0.092108 }, {3.516393, 1.418750, 0.911917, 0.638743, 0.391245, 0.231034, 0.179505, 0.193576, 0.191858, 0.177448 }, {2.697080, 0.956731, 0.852941, 0.582846, 0.253165, 0.312102, 0.298497, 0.291288, 0.273451, 0.246665 }, {4.599034, 1.394572, 0.614100, 0.355603, 0.489116, 0.514453, 0.512443, 0.450394, 0.471477, 0.459562 }, {1.957878, 1.837482, 0.707602, 0.535855, 0.714891, 0.721028, 0.740920, 0.764918, 0.794361, 0.932574 }, {1.487206, 0.720536, 0.540731, 
0.588331, 0.858526, 0.899425, 0.983878, 0.985120, 1.009868, 1.104201 }, {0.597442, 0.741954, 0.785352, 0.783149, 1.047668, 1.143060, 0.967139, 1.013266, 1.012165, 1.001293 }, }; double upcrt_s2p_halfnode_inter_pipeblock[UPCRT_S2P_SAMPLE_V][UPCRT_S2P_SAMPLE_D] = { {0.760000, 0.745098, 2.432432, 2.316129, 2.676923, 2.816162, 2.935175, 2.940584, 2.882490, 2.638622 }, {7.411765, 3.666667, 2.818182, 2.669173, 2.629213, 2.932203, 2.696585, 2.848407, 2.828521, 2.512926 }, {4.708333, 3.380952, 2.465753, 2.919355, 2.528986, 2.667954, 2.907757, 2.751988, 2.664671, 2.610185 }, {3.580645, 3.820513, 2.757143, 2.461039, 2.696970, 2.685437, 2.673430, 2.695610, 2.584791, 2.458454 }, {6.863636, 3.325000, 3.320513, 2.707143, 2.287540, 2.516934, 2.492458, 2.585209, 2.452376, 2.367432 }, {5.809524, 3.902439, 2.412500, 2.433121, 2.436667, 2.404643, 2.353568, 2.406661, 2.294731, 2.143320 }, {11.416667, 4.622222, 3.655914, 2.527094, 2.633333, 2.522868, 2.773369, 2.831442, 2.559065, 2.272225 }, {5.750000, 7.257576, 3.905797, 3.167235, 2.625000, 2.688252, 2.526112, 2.527511, 2.338227, 1.910509 }, {6.677083, 5.939130, 5.072034, 2.835655, 3.529265, 2.654706, 2.664153, 2.529783, 2.416934, 1.779068 }, {9.673267, 6.652582, 5.992084, 4.969128, 3.147507, 2.962305, 2.655324, 2.612954, 2.523851, 1.513361 }, {6.152778, 3.866667, 4.928125, 3.447619, 2.881525, 2.518106, 2.601308, 2.365621, 2.313937, 2.105952 }, }; double upcrt_s2p_halfnode_inter_visblock[UPCRT_S2P_SAMPLE_V][UPCRT_S2P_SAMPLE_D] = { {10.000000, 0.676471, 1.027027, 1.116129, 0.626923, 0.284848, 0.182784, 0.123607, 0.066407, 0.033868 }, {10.000000, 4.055556, 1.909091, 1.097744, 0.614232, 0.544492, 0.184390, 0.146968, 0.079027, 0.046350 }, {7.333333, 3.309524, 1.534247, 1.443548, 0.489130, 0.413127, 0.277778, 0.116799, 0.100120, 0.068176 }, {4.161290, 3.897436, 2.071429, 0.733766, 0.651515, 0.479612, 0.251208, 0.137561, 0.098670, 0.108643 }, {10.227273, 3.325000, 1.948718, 1.321429, 0.559105, 0.356506, 0.289264, 0.253101, 0.266869, 
0.218058 },
    {20.428571, 5.536585, 2.200000, 1.554140, 0.953333, 0.555556, 0.422477, 0.465872, 0.440262, 0.380327 },
    {30.791667, 4.422222, 3.118280, 1.472906, 0.666667, 0.787392, 0.827841, 0.824766, 0.699779, 0.560479 },
    {26.444444, 10.121212, 2.398551, 1.126280, 1.283929, 1.382979, 1.294487, 1.138375, 1.102420, 0.877998 },
    {13.072917, 10.913043, 3.588983, 1.519499, 2.523039, 1.914118, 1.973924, 1.935077, 1.919917, 1.659113 },
    {14.386139, 4.793427, 3.240106, 2.923490, 2.702216, 2.664371, 2.612514, 2.574074, 2.548757, 1.671054 },
    {3.675926, 2.868889, 3.870312, 2.700000, 3.018882, 2.878348, 2.515826, 2.397002, 2.342087, 2.108675 },
};

/*
 * RANGER variant of the strided-transfer protocol selector.
 *
 * Vol, Depth       forwarded to upcrt_s2p_speed() by the (currently dead)
 *                  model code below, Depth in the NMSG slot and Vol in the
 *                  V slot
 * Thread_Distance  compared against BUPC_THREADS_NEAR to choose between the
 *                  "intra" and "inter" profile tables
 * proto            out: one of the UPCRT_S2P_PROTO_* constants
 *
 * Returns 0 when the strided (VIS) interface should be used and 1 when the
 * caller should fall back to the protocol stored in *proto.
 *
 * As written the function ALWAYS selects UPCRT_S2P_PROTO_VIS and returns 0:
 * the first statement is an unconditional return, so the whole decision tree
 * after it is unreachable.
 * NOTE(review): the dead code names tables upcrt_s2p_intra_* and
 * upcrt_s2p_inter_*, while the tables visible in this part of the file carry
 * a node_/halfnode_ infix -- confirm those names exist before re-enabling it.
 */
UPCRV_INLINE(upcrt_Coalesce_is_Slow)
int upcrt_Coalesce_is_Slow(int Vol, int Depth, int Thread_Distance, int *proto)
{
    double **profile;   /* declared but never referenced */

    /* Model disabled: always use the strided interface. */
    return (*proto = UPCRT_S2P_PROTO_VIS, 0);

    /* ---- unreachable from here on ---- */
    /* need to figure out what error am I allowing, right now is 10% */
    if(Thread_Distance <= BUPC_THREADS_NEAR) {
        /* nearby thread: consult the "intra" tables */
        if(upcrt_s2p_speed(upcrt_s2p_intra_vispipe, Depth, Vol) < 1.0) {
            if(upcrt_s2p_speed(upcrt_s2p_intra_visblock, Depth, Vol) < 1.0) {
                *proto = UPCRT_S2P_PROTO_VIS;
                return 0;
            } else {
                *proto = UPCRT_S2P_PROTO_BLOCK;
                return 1;
            }
        } else {
            if(upcrt_s2p_speed(upcrt_s2p_intra_pipeblock, Depth, Vol) < 1.0) {
                *proto = UPCRT_S2P_PROTO_PIPE;
                return 1;
            } else {
                *proto = UPCRT_S2P_PROTO_BLOCK;
                return 1;
            }
        }
    } else {
        /* distant thread: consult the "inter" tables */
        if(upcrt_s2p_speed(upcrt_s2p_inter_vispipe, Depth, Vol) < 1.0) {
            if(upcrt_s2p_speed(upcrt_s2p_inter_visblock, Depth, Vol) < 1.0) {
                *proto = UPCRT_S2P_PROTO_VIS;
                return 0;
            } else {
                *proto = UPCRT_S2P_PROTO_BLOCK;
                return 1;
            }
        } else {
            if(upcrt_s2p_speed(upcrt_s2p_inter_pipeblock, Depth, Vol) < 1.0) {
                *proto = UPCRT_S2P_PROTO_PIPE;
                return 1;
            } else {
                *proto = UPCRT_S2P_PROTO_BLOCK;
                return 1;
            }
        }
    }
    return (*proto = UPCRT_S2P_PROTO_UNDEF, 0);
}
#endif /* RANGER */

/* Fall-back selector used when none of the tuned platforms is defined. */
#if !defined(JACQUARD) && !defined(HIVE) && !defined(BIGBEN) && !defined(BASSI) && !defined(RANGER)
UPCRV_INLINE(upcrt_Coalesce_is_Slow)
int upcrt_Coalesce_is_Slow(int Vol, int Depth, int
Thread_Distance, int *proto) { *proto = UPCRT_S2P_PROTO_VIS; return 0; } #endif #if defined(JACQUARD) || defined(HIVE) || defined(BIGBEN) #if defined(JACQUARD) double upcrt_s2p_inter[UPCRT_S2P_SAMPLE_V][UPCRT_S2P_SAMPLE_D] = { {1.647059, 0.750000, 0.428571, 0.258065, 0.149701, 0.086817, 0.050325, 0.037975, 0.023789, 0.018297 }, {1.157895, 0.766667, 0.421053, 0.268817, 0.158537, 0.091195, 0.054366, 0.045340, 0.031185, 0.023712 }, {1.277778, 0.800000, 0.431034, 0.274725, 0.180124, 0.100649, 0.087248, 0.056540, 0.043314, 0.033681 }, {1.333333, 0.781250, 0.464286, 0.307692, 0.187879, 0.161290, 0.104907, 0.085985, 0.064003, 0.056145 }, {1.666667, 0.812500, 0.500000, 0.340909, 0.310127, 0.201365, 0.176166, 0.136404, 0.120231, 0.102101 }, {3.238095, 0.848485, 0.574074, 0.584270, 0.438710, 0.355401, 0.295082, 0.278140, 0.241784, 0.201491 }, {6.000000, 1.200000, 0.910714, 0.758621, 0.828947, 0.710801, 0.617060, 0.514842, 0.464980, 0.368926 }, {6.000000, 4.976190, 1.847458, 1.489796, 1.310734, 1.050296, 0.970992, 0.846213, 0.728141, 0.650487 }, {5.583333, 5.020408, 3.191781, 2.068182, 1.771429, 1.769556, 1.813392, 1.690744, 1.647323, 1.568508 }, {1.297872, 1.471429, 1.339130, 0.970297, 0.989446, 0.997297, 0.995270, 1.004775, 0.998317, 1.001675 } }; double upcrt_s2p_node_inter[UPCRT_S2P_SAMPLE_V][UPCRT_S2P_SAMPLE_D] = { {1.269231, 0.787234, 0.377778, 0.178378, 0.085714, 0.043081, 0.023638, 0.014512, 0.011968, 0.007771 }, {0.796296, 0.780488, 0.390805, 0.189189, 0.089947, 0.046936, 0.034777, 0.023432, 0.021100, 0.011369 }, {1.625000, 1.048780, 0.379310, 0.189189, 0.111111, 0.070773, 0.055884, 0.058105, 0.030050, 0.017257 }, {2.219512, 1.833333, 0.617021, 0.256545, 0.170854, 0.125654, 0.093894, 0.079894, 0.049298, 0.030438 }, {1.871795, 2.395349, 0.977011, 0.375000, 0.278515, 0.239529, 0.186562, 0.135323, 0.087770, 0.059443 }, {3.760563, 3.883721, 1.840909, 0.754098, 0.545699, 0.454425, 0.318783, 0.221479, 0.203077, 0.153323 }, {9.894737, 4.046512, 2.348837, 1.277778, 0.874652, 
0.680939, 0.481431, 0.392556, 0.354798, 0.337776 }, {9.215686, 5.469388, 3.416667, 2.254658, 1.498489, 0.944776, 0.841972, 0.931522, 0.904685, 0.898767 }, {2.125475, 5.259259, 5.228070, 3.250000, 2.573086, 2.538755, 2.552116, 2.716800, 2.780007, 2.787658 }, {5.054945, 3.228814, 1.384211, 1.024590, 1.019202, 0.962482, 1.001100, 1.018791, 1.015773, 0.999384 } }; double upcrt_s2p_node_intra[UPCRT_S2P_SAMPLE_V][UPCRT_S2P_SAMPLE_D] = { {1.650000, 0.789474, 0.400000, 0.190789, 0.095082, 0.046699, 0.025620, 0.014992, 0.011946, 0.009210 }, {1.115385, 0.805556, 0.391892, 0.197279, 0.094771, 0.049206, 0.027978, 0.021166, 0.015363, 0.012578 }, {1.035714, 0.833333, 0.391892, 0.193333, 0.104575, 0.055205, 0.040000, 0.028457, 0.023111, 0.021751 }, {1.037037, 0.694444, 0.397260, 0.212329, 0.115512, 0.078025, 0.062450, 0.047581, 0.042648, 0.041874 }, {1.148148, 0.783784, 0.418919, 0.241379, 0.178082, 0.135043, 0.104326, 0.089204, 0.082750, 0.087737 }, {1.615385, 0.837838, 0.479452, 0.368794, 0.292857, 0.222426, 0.190647, 0.180870, 0.182131, 0.186875 }, {1.555556, 0.897436, 0.867647, 0.596899, 0.482213, 0.465306, 0.421578, 0.422166, 0.399563, 0.380301 }, {1.733333, 1.108696, 1.205882, 1.036765, 1.258993, 0.869328, 0.853128, 0.887774, 0.876397, 0.897896 }, {1.914634, 1.423077, 1.837838, 1.838235, 1.953678, 1.952712, 2.038217, 2.063458, 2.058296, 1.994495 }, {1.612500, 0.723077, 0.911917, 0.903047, 0.984615, 0.994586, 0.989919, 0.999230, 1.001033, 0.998657 }, }; UPCRV_INLINE(upcrt_Coalesce_is_Slow) int upcrt_Coalesce_is_Slow(int Vol, int Depth, int Thread_Distance, int *proto) { double **profile; /*return *proto = UPCRT_S2P_PROTO_VIS, 0;*/ /* need to figure out what error am I allowing, right now is 10% */ *proto = UPCRT_S2P_PROTO_VIS; if(Thread_Distance <= BUPC_THREADS_NEAR) return /*0*/ (upcrt_s2p_speed(upcrt_s2p_node_intra, Depth, Vol) <= 1.0) ? 
(*proto = UPCRT_S2P_PROTO_VIS, 0) : (*proto = UPCRT_S2P_PROTO_PIPE, 1) ; else return (upcrt_s2p_speed(upcrt_s2p_node_inter, Depth, Vol) <= 1.0) ? (*proto = UPCRT_S2P_PROTO_VIS, 0) : (*proto = UPCRT_S2P_PROTO_PIPE, 1); } #else/*HIVE or BB, not JACQ*/ UPCRV_INLINE(upcrt_Coalesce_is_Slow) int upcrt_Coalesce_is_Slow(int Vol, int Depth, int Thread_Distance, int *proto) { *proto = UPCRT_S2P_PROTO_UNDEF; /*(DV0 && Line(V) > D)*/ return (Depth < D0 && Vol < V0) || (Vol > V0 && ((int)(S1*Vol+C1) > Depth)); } #endif/*!JACQ*/ static double Eval_CompBnd_Pipe(int V, int S, int b, double ctime) { double N; double nb; double time; double e; N = V/S; nb = (int)(N/b); if(S*b > MAX_VOL_OUTSTANDING || S*b > V || (2*b - 1)*o[V_to_idx(S)][Log2(b)] > L + G[V_to_idx(S)]*((double)S/8)) return 1000000000000000000000.0; //time = (o[0] + L + G[V_to_idx(S)]*((double)S/8)) + ctime + o[0] *(double)b * (nb - 2); time = (o[V_to_idx(S)][Log2(b)] + L + G[V_to_idx(S)]*((double)S/8)) + ctime + o[V_to_idx(S)][Log2(b)] *(double)b * (nb - 2); return time; } static double Eval_CompBnd_NonPipe(int V, int S, int b, double ctime) { double N; double nb; double time; double e; N = V/S; nb = (int)(N/b); if(S*b > MAX_VOL_OUTSTANDING || S*b > V ) return 1000000000000000000000.0; //time = (o[0] + L + G[V_to_idx(S)]*((double)S/8))*(double)nb + ctime; time = (o[V_to_idx(S)][Log2(b)] + L + G[V_to_idx(S)]*((double)S/8))*(double)nb + ctime; return time; } static double Eval_CommBnd_Pipe(int V, int S, int b, double ctime) { int N; int nb; double time; double e; N = V/S; nb = N/b; if(S*b > MAX_VOL_OUTSTANDING || S*b > V || (2*b - 1)*o[V_to_idx(S)][Log2(b)] > L + G[V_to_idx(S)]*((double)S/8)) return 1000000000000000000000.0; e = ((double)nb)*(ctime/ (G[V_to_idx(V)]*((double)V/8))); //time = (o[0] + L)*e + ((double)N) * G[V_to_idx(S)] * ((double)S/8) +(ctime/(double)N); time = (o[V_to_idx(S)][Log2(b)] + L)*e + ((double)N) * G[V_to_idx(S)] * ((double)S/8) +(ctime/(double)N); return time; } static double 
Eval_CommBnd_NonPipe(int V, int S, int b, double ctime)
{
    /*
     * Cost estimate for a communication-bound loop without pipelining.
     * V = total volume, S = strip size, b = burst length, ctime =
     * computation time.  Infeasible configurations yield a huge sentinel.
     */
    int N;          /* number of strips: V/S (integer division) */
    int nb;         /* number of bursts: N/b (integer division) */
    double time;

    N = V/S;
    nb = N/b;
    /* too much data in flight, or one burst larger than the whole volume */
    if(S*b > MAX_VOL_OUTSTANDING || S*b > V)
        return 1000000000000000000000.0;
    // time = (o[0] + L)*((double)nb) + ((double)N) * G[V_to_idx(S)] * ((double)S/8) + ((double)nb)*(ctime/(double)N);
    time = (o[V_to_idx(S)][Log2(b)] + L)*((double)nb) + ((double)N) * G[V_to_idx(S)] * ((double)S/8) + ((double)nb)*(ctime/(double)N);
    return time;
}

/*
   We are looking for the minimum value in the optimization space.
   First cut: try a descending-hill heuristic.  Total time decreases
   downwards (as the strip size grows) and to the right (as the burst
   length grows).
   Start with S at log2(V)/FACTOR and b=2.  FACTOR depends on the
   communication/computation ratio.  For the time being we start at S=128.
   Evaluate until the time starts increasing, then start increasing the
   burst size.
*/
/*
 * Hill search over (strip size, burst length) using Eval_CommBnd_Pipe.
 *   ctime  computation time forwarded to the cost function
 *   V      total volume
 *   Sopt   out: strip size (see notes)
 *   Bopt   out: burst length (see notes)
 *
 * NOTE(review): *Sopt and *Bopt are written only when a candidate is found
 * to be slower than its predecessor.  If a loop runs to completion without
 * such a candidate (including the case where it never iterates), the
 * corresponding output is left untouched -- callers must pre-initialise it.
 * NOTE(review): the value stored is the first *slower* candidate, not the
 * best one; the best strip size is S>>1, which is what the burst search
 * below continues with.  Confirm which value callers expect.
 */
static void Opt_Comm_Bnd(double ctime, int V, int *Sopt, int *Bopt)
{
    int start;              /* assigned, never read */
    int b;
    double time;
    double old_time;        /* best cost seen so far */
    int S;
    int globalS, globalb;   /* only used by the disabled brute-force search */

    start = 1;
    b = 1;
    globalS = 128;
    globalb = 1;
    old_time = Eval_CommBnd_Pipe(V,globalS, globalb, ctime);
    /* first do brute force search for the paper */
    /*
      for(S = MIN_STRIP_SIZE; S <= V / 2; S<<=1) {
        for(b = 1; b <= MAX_BURST_LEN; b<<=1) {
          time = Eval_CommBnd_Pipe(V, S, b, ctime);
          if(time < old_time) {
            globalS = S;
            globalb = b;
            old_time = time;
          }
        }
      }
    */
    /* Phase 1: b fixed at 1, double S from 256 until the cost goes up. */
    b=1;
    old_time = Eval_CommBnd_Pipe(V, 128, b, ctime);
    for(S = 256; S < V/2; S<<=1) {
        time = Eval_CommBnd_Pipe(V, S, b, ctime);
        if(time > old_time) {
            *Sopt = S;
            break;
        } else
            old_time = time;
    }
    /* step back to the last strip size that was not slower */
    S>>=1;
    /* Phase 2: S fixed, double b from 4 until the cost goes up. */
    for(b = 4; b < UPCRT_MAX_BURST_LEN; b<<=1) {
        time = Eval_CommBnd_Pipe(V,S,b,ctime);
        if(time > old_time) {
            *Bopt = b;
            break;
        } else
            old_time = time;
    }
}

/*
 * Same hill search as Opt_Comm_Bnd, driven by Eval_CompBnd_Pipe.
 */
static void Opt_Comp_Bnd(double ctime, int V, int *Sopt, int *Bopt)
{
    int start;              /* assigned, never read */
    int b;
    double time;
    double old_time;
    int S;
    int globalS, globalb;   /* only used by the disabled brute-force search */

    start = 1;
    b = 1;
    globalS = 128;
    globalb = 1;
    old_time = Eval_CompBnd_Pipe(V,globalS, globalb, ctime);
/* first do brute force search for the paper */
    /*
      for(S = MIN_STRIP_SIZE; S <= V / 2; S<<=1) {
        for(b = 1; b <= MAX_BURST_LEN; b<<=1) {
          time = Eval_CompBnd_Pipe(V, S, b, ctime);
          if(time < old_time) {
            globalS = S;
            globalb = b;
            old_time = time;
          }
        }
      }
    */
    /* Phase 1: b fixed at 1, double S from 256 until the cost goes up.
       *Sopt is written only if such an S is found. */
    b=1;
    old_time = Eval_CompBnd_Pipe(V, 128, b, ctime);
    for(S = 256; S < V/2; S<<=1) {
        time = Eval_CompBnd_Pipe(V, S, b, ctime);
        if(time > old_time) {
            *Sopt = S;
            break;
        } else
            old_time = time;
    }
    /* step back to the last strip size that was not slower */
    S>>=1;
    /* Phase 2: S fixed, double b from 4 until the cost goes up.
       *Bopt is written only if such a b is found. */
    for(b = 4; b < UPCRT_MAX_BURST_LEN; b<<=1) {
        time = Eval_CompBnd_Pipe(V,S,b,ctime);
        if(time > old_time) {
            *Bopt = b;
            break;
        } else
            old_time = time;
    }
}
#endif
/* ***************************** END 1F1 Models *****************************/

/*
 * Print one dimension descriptor on stdout: stride, span and init_pos
 * (the latter under the label "DPTH").  i is the dimension index shown
 * after "L"; prefix is printed in front of the record.
 */
static void Print_Dim( upcrt_Dim *d, int i, const char *prefix)
{
    printf("\t%s L%d: STR = %d, SP = %d, DPTH = %d\n", prefix, i, d->stride, d->span, d->init_pos);
}

/* Print every dimension of dvec (dvec->cnt entries) through Print_Dim. */
static void Print_DimVec(upcrt_DimVec *dvec, const char* prefix)
{
    int i;
    for(i = 0; i < dvec->cnt; i++)
        Print_Dim(&dvec->elem[i], i, prefix);
}

/*
 * Dump one Lmad on stdout: header (dimension count, order, redist_ref,
 * access type), each dimension, total span and displacement, the remote and
 * local shared pointers, init_pos and the two equivalence lists.
 * i is the Lmad index shown in the header; it is reused as a loop counter
 * afterwards.  An unrecognised type is reported through upcrt_err().
 */
static void Print_Lmad(upcrt_Lmad *l, int i, const char* prefix)
{
    int j = i;
    char buf1[256], buf2[256];   /* buf2 is never used */

    printf("\t%s LMAD%d: ORIG DIMS = %d \t ORDER = %d\t \t REDIST = %d \t", prefix, i, l->dvec.cnt, l->order, l->redist_ref);
    if(l->type == UPCRT_REF_READ)
        printf("READ\n");
    else if(l->type == UPCRT_REF_WRITE)
        printf("WRITE\n");
    else if(l->type == UPCRT_REF_RDWR)
        printf("RDWR\n");
    else
        upcrt_err("Junk lmad\n");
    for(i = 0; i < l->dvec.cnt; i++)
        Print_Dim(&l->dvec.elem[i], i, prefix);
    printf("\t%s ELC = %d \t DISP = %d\n", prefix, l->total_span, l->disp);
    printf("\t%s REMOTE ", prefix);
    _bupc_dump_shared(l->remote, buf1, 256);
    printf("\n%s\n", buf1);
    printf("\t%s LOCAL ", prefix);
    /* buf1 is reused for the local pointer */
    _bupc_dump_shared(l->local, buf1, 256);
    printf("\n%s\n", buf1);
    printf("\t%s INIT POS = %d", prefix, l->init_pos);
    printf("\t EQUIV READ:");
    for(j=0; j < l->equiv_read.cnt; j++)
        printf("\t%d",
l->equiv_read.elem[j]);
    printf("\n EQUIV WRITE:");
    for(j=0; j < l->equiv_write.cnt; j++)
        printf("\t%d", l->equiv_write.elem[j]);
    printf("\n\n");
}

/*
 * Dump one reference descriptor: a header line (alias, peer_ref, number of
 * Lmads), every Lmad in r->desc, then every Lmad in r->comm under the
 * heading "SUMMARY COMM".  i is the reference index shown in the header and
 * is reused as the loop counter.
 */
static void Print_RefDesc(upcrt_RefDesc *r, int i, const char* prefix)
{
    int j;   /* unused */

    printf("%s REF%d\t %d PEER_REF = %d\t LMADS = %d\n", prefix, i, r->alias, r->peer_ref, r->desc.cnt);
    for(i = 0; i < r->desc.cnt; i++)
        Print_Lmad(r->desc.elem[i], i, prefix);
    printf("%s SUMMARY COMM\n", prefix);
    for(i = 0; i < r->comm.cnt; i++)
        Print_Lmad(r->comm.elem[i], i, prefix);
}

/*
 * Dump a whole loop nest (bounds and references), but only on the thread
 * whose id equals UPCRT_DEBUG_THREAD; every other thread prints nothing.
 */
static void Print_LoopNest( upcrt_LoopNest *l)
{
    UPCR_BEGIN_FUNCTION();
    int i, j;   /* j is unused */

    if(upcr_mythread() == UPCRT_DEBUG_THREAD) {
        printf("---------------- START NEST -----------------\n");
        printf("POLYTOPE DIM = %d\n", l->bounds.cnt);
        for(i = 0; i < l->bounds.cnt; i++)
            printf("\t BNDS%d (%ld : %ld : %ld)\n", i, l->bounds.elem[i].lb, l->bounds.elem[i].ub, l->bounds.elem[i].stride);
        printf("\nREFERENCES = %d\n", l->refs.cnt);
        for(i = 0; i < l->refs.cnt; i++) {
            Print_RefDesc(&l->refs.elem[i],i,"\t");
        }
        printf("---------------- END NEST -----------------\n");
    }
}

/* Try to collapse the dimensions of an Lmad and add the resulting Lmad */
/* to the anal vector, even if the original Lmad is not simplified.     */
/*
 * l     source Lmad (its equivalence list is extended, see end of function)
 * anal  vector receiving the newly allocated Lmad at index anal->cnt
 *
 * The new Lmad is heap-allocated here; nothing in this function frees it.
 */
static void Coalesce_Lmad_Dim(upcrt_Lmad *l, upcrt_LmadVec *anal)
{
    UPCR_BEGIN_FUNCTION();
    int i,j, nelem, left;
    upcrt_Lmad *targ;
    upcrt_DimVec *dv;

    targ = (upcrt_Lmad*) calloc(1,sizeof(upcrt_Lmad));
    upcrt_assert(targ);
    anal->elem[anal->cnt] = targ;
    targ->init_pos = anal->cnt;
    /* NOTE(review): the copy lands at the start of *targ, not explicitly at
       &targ->dvec; this matches only if dvec is the first member of
       upcrt_Lmad -- confirm against the struct definition. */
    memcpy(targ, &l->dvec, sizeof(upcrt_DimVec));
    dv = &targ->dvec;
    nelem = dv->cnt;            /* original dimension count; dv->cnt shrinks below */
    if(nelem == 1)
        goto end;               /* nothing to coalesce */
    qsort(&dv->elem, dv->cnt, sizeof(upcrt_Dim), &Comp_Dims);
    left = 0;
    /* Merge every later dimension that Can_Coalesce() accepts into
       elem[left]; a merged-away dimension is marked by span == 0. */
    while(dv->cnt != 1 && left < nelem) {
        if(dv->elem[left].span == 0) {
            left++;
            continue;
        }
        for(i = left+1; i < nelem; i++) {
            if(dv->elem[i].span == 0)
                continue;
            if(Can_Coalesce(&dv->elem[left],&dv->elem[i])) {
                // if(dv->elem[i].span > dv->elem[left].span) {
                if(dv->elem[i].stride == dv->elem[left].stride)
                    dv->elem[left].span += dv->elem[i].span;   /* same stride: spans add up */
                else
                    dv->elem[left].span = dv->elem[i].span;    /* otherwise take the other span */
                //}
                dv->elem[i].span = 0;
dv->elem[left].init_pos = (dv->elem[left].init_pos < dv->elem[i].init_pos) ? dv->elem[left].init_pos : dv->elem[i].init_pos; dv->cnt--; } } left++; } i = 1; j = 1; /* coalescing leaves holes in the middle */ /* move everything to the left */ while(i < dv->cnt) { while(dv->elem[j].span ==0 && j < nelem) j++; if(i != j) { memcpy(&dv->elem[i], &dv->elem[j], sizeof(upcrt_Dim)); i = j; } else { i++; j++; } } end: /*update the other fields*/ targ->depth = l->depth; /*for 1-1 correspondence it still makes sense*/ targ->disp = l->disp; targ->total_span = l->total_span; targ->local = l->local; targ->remote = l->remote; targ->order = l->order; targ->type = l->type; targ->redist_ref = l->redist_ref; if(targ->type == UPCRT_REF_READ) { Add_IntElement((upcrt_IntVec*)&l->equiv_read, &targ->init_pos, sizeof(int)); Add_IntElement((upcrt_IntVec*)&targ->equiv_read, &l->init_pos, sizeof(int)); } else if(targ->type == UPCRT_REF_WRITE) { Add_IntElement((upcrt_IntVec*)&l->equiv_write, &targ->init_pos, sizeof(int)); Add_IntElement((upcrt_IntVec*)&targ->equiv_write, &l->init_pos, sizeof(int)); } else { upcrt_err("Unexpected type for initial Lmad\n"); } anal->cnt++; } /*test if to Lmads are contiguos and put the result in targ*/ /*return true/false ...*/ /* TODO: This function should be improved: it fails to collapse the references from SP/y_solve.c x[m][0][i][j] and x[m][1][i][j] */ static int Test_Lmad_Contig(upcrt_Lmad *l, upcrt_Lmad *r, upcrt_Lmad *targ, upcrt_RefDesc* ref) { int result; int i, j; int nelem; upcrt_DimVec *dvl, *dvr; upcrt_Dim *dl, *dr; int diff; uintptr_t addr_span; int test_stride; dvl = &l->dvec; dvr = &r->dvec; /*l, r should come after coalesce and the strides */ /* are sorted in increasing order*/ upcrt_assert((uintptr_t)upcr_shared_to_processlocal(l->remote) <= (uintptr_t)upcr_shared_to_processlocal(r->remote)); for(i=0; i < Elems((upcrt_Vec*)dvl) - 1; i++) { upcrt_assert(dvl->elem[i].stride <= dvl->elem[i+1].stride); } for(i=0; i < Elems((upcrt_Vec*)dvr) - 1; i++) 
{
        upcrt_assert(dvr->elem[i].stride <= dvr->elem[i+1].stride);
    }
    /*if not same number of dims, not equiv */
    if((nelem = Elems((upcrt_Vec*)dvl)) != Elems((upcrt_Vec*)dvr))
        return 0;

    /* Walk both dimension vectors in lock-step looking for the single
       dimension in which they differ (stride or span). */
    dl = &dvl->elem[0];
    dr = &dvr->elem[0];
    diff = 0;           /* index of the last differing dimension seen */
    result = 0;         /* 1 while exactly one difference has been seen */
    test_stride = dl->stride;
    for(i = 0; i < nelem; i++) {
        if(dl->stride != dr->stride || dl->span != dr-> span) {
            diff = i;
            if(result == 1) {
                result = 0; /*found the second difference, abort */
                break;
            }
            result = 1; /* found one difference*/
            test_stride = dl->stride;
        }
        dl++;
        dr++;
    }
    dl = &dvl->elem[diff];
    dr = &dvr->elem[diff];
    /* Give up when no single difference was isolated, or when the differing
       dimension does not even share its stride.
       NOTE(review): the "i>1" guard lets two cases through with
       result == 0: identical one-dimensional vectors, and a second
       difference found at index 1 (the loop breaks with i == 1).  Confirm
       the latter is intended. */
    if((result == 0 && i>1) || dl->stride != dr->stride)
        return 0;
    /*at this point first diff is the idx that needs testing*/
    /* distance between the two remote base addresses, as computed by
       upcr_sub_psharedI with the reference's element size */
    addr_span = (uintptr_t) upcr_sub_psharedI(upcr_shared_to_pshared(r->remote), upcr_shared_to_pshared(l->remote), ref->esize);
    /*stride divides the addr_span*/
    if(addr_span % test_stride)
        return 0;
    /* r must start no further than one stride past the end of l's span */
    if(addr_span > (uintptr_t)dl->span + (uintptr_t) dl->stride)
        return 0;
    /*there's one more condition here if we move to symbolic representation*/
    /*create the result descriptor*/
    if((uintptr_t)dr->span + addr_span > dl->span) {
        /* r reaches beyond l: start from r, widen it by the offset and
           rebase it at l's remote address */
        memcpy(targ, r, sizeof(upcrt_Lmad));
        targ->dvec.elem[diff].span += addr_span;
        targ->total_span += addr_span;
        targ->remote = l->remote;
        /* add the equiv list of l to targ */
        targ->type |= l->type;
        for(i=0; i < Elems((upcrt_Vec*)&l->equiv_read); i++)
            Add_Element((upcrt_Vec*)&targ->equiv_read,&l->equiv_read.elem[i], sizeof(int));
        for(i=0; i < Elems((upcrt_Vec*)&l->equiv_write); i++)
            Add_Element((upcrt_Vec*)&targ->equiv_write,&l->equiv_write.elem[i], sizeof(int));
    } else {
        /* l already covers r: start from l and absorb r's type and
           equivalence lists */
        memcpy(targ, l, sizeof(upcrt_Lmad));
        targ->type |= r->type;
        for(i=0; i < Elems((upcrt_Vec*)&r->equiv_read); i++)
            Add_Element((upcrt_Vec*)&targ->equiv_read, &r->equiv_read.elem[i], sizeof(int));
        for(i=0; i < Elems((upcrt_Vec*)&r->equiv_write); i++)
            Add_Element((upcrt_Vec*)&targ->equiv_write, &r->equiv_write.elem[i], sizeof(int));
    }
    /* update the start address of the equivalent lmads */
    for(i = 0; i <
Elems((upcrt_Vec*)&r->equiv_read); i++) { ref->desc.elem[r->equiv_read.elem[i]]->disp += addr_span; } for(i = 0; i < Elems((upcrt_Vec*)&r->equiv_write); i++) { ref->desc.elem[r->equiv_write.elem[i]]->disp += addr_span; } targ->order = Summary_Order(l->order, r->order); return 1; } /* return the strip size */ size_t upcrt_get_strips(upcr_nest_descr_t ln) { UPCR_BEGIN_FUNCTION(); size_t result; upcrt_LoopNest **cur_nest = (upcrt_LoopNest**)UPCR_TLD_ADDR(upcrt_cur_nest); result = ((*cur_nest)->comm_read[0].elem[0]->size / (*cur_nest)->comm_read[0].elem[0]->esize) ; return result; } int upcrt_get_coeff(upcr_nest_descr_t ln, upcr_ref_descr_t lref, upcr_lmad_descr_t lmad, int dim) { upcrt_err("Runtime computed coefficients not implemented yet\n"); return 1; } static int Stride_Order(int stride1, int stride2) { int order; if(stride1 < stride2) order = UPCRT_STRIDE_INCR; else if (stride1 > stride2) order = UPCRT_STRIDE_DECR; else order = UPCRT_STRIDE_INTV; return order; } /* The loops are normalized to <= in the compiler This means the span needs to be adjusted by 1 */ void _upcrt_add_sos_dim(upcr_nest_descr_t nest, upcr_ref_descr_t ref, upcr_lmad_descr_t lmad, long dim, long stride, long elc) { UPCR_BEGIN_FUNCTION(); upcrt_Dim *d; upcrt_Dim *l; int order,i; upcrt_LoopNestVec *program = *(upcrt_LoopNestVec **)UPCR_TLD_ADDR(upcrt_program); upcrt_LoopNest **cur_nest = (upcrt_LoopNest**)UPCR_TLD_ADDR(upcrt_cur_nest); upcrt_RefDesc **cur_ref = (upcrt_RefDesc**)UPCR_TLD_ADDR(upcrt_cur_ref); upcrt_Lmad **cur_lmad = (upcrt_Lmad**)UPCR_TLD_ADDR(upcrt_cur_lmad); upcrt_DimVec **cur_dim = (upcrt_DimVec**)UPCR_TLD_ADDR(upcrt_cur_dim); upcrt_assert((*cur_nest) == &program->elem[nest]); upcrt_assert((*cur_ref) == &(*cur_nest)->refs.elem[ref]); upcrt_assert((*cur_lmad) == (*cur_ref)->desc.elem[lmad]); upcrt_assert((*cur_dim) == &(*cur_lmad)->dvec); upcrt_assert(Elems((upcrt_Vec*)(*cur_dim)) == dim); upcrt_assert(Elems((upcrt_Vec*)(*cur_dim)) < UPCRT_MAX_DIM); d = &(*cur_dim)->elem[dim]; 
d->stride = stride;
    d->span = elc+stride;       /* span is stored with one extra stride added */
    //(*cur_lmad)->total_span *= (elc+stride)/stride;
    d->init_pos = dim;
    Inc_Cnt((upcrt_Vec*)(*cur_dim));
    /* Determine order: everything is easy for UGS.
       The question is what to do for MIVs (aka a[i+j][j])
       and coupled subscripts (aka a[i+j]).
       At this point coupled subscripts a[c*i+c*j] are classified as INTV.
       With this approach ((1,0), (1,N)) will be classified as UNKNOWN;
       same for the reverse order.
    */
    order = (*cur_lmad)->order;
    if(dim > 0) {
        l = d-1;        /* the dimension added just before this one */
        switch(order) {
        case UPCRT_STRIDE_UNKNOWN:
            if(elc && l->span) {
                /* both dimensions are non-degenerate: compare directly */
                (*cur_lmad)->order = Stride_Order(l->stride, stride);
            } else if (!l->span) {
                //keep going left until seeing span!= 0
                if(dim > 1)
                    l--;
                for(i=dim-2; i>0; i--)
                    if(l->span != 0)
                        break;
                    else
                        l--;
                if(l->span != 0) {
                    (*cur_lmad)->order = Stride_Order(l->stride, stride);
                }
            }
            break;
        case UPCRT_STRIDE_DECR:
            /* a non-decreasing step breaks a decreasing sequence */
            if(elc && l->span && stride >= l->stride) {
                (*cur_lmad)->order = UPCRT_STRIDE_INTV;
            }
            break;
        case UPCRT_STRIDE_INCR:
            /* a non-increasing step breaks an increasing sequence */
            if(elc && l->span && stride <= l->stride)
                (*cur_lmad)->order = UPCRT_STRIDE_INTV;
            break;
        case UPCRT_STRIDE_INTV:
            break;      /* already mixed: nothing can change it */
        default:
            upcrt_err("Unexpected order field in Lmad\n");
        }
    }
}

/* Scratch buffer; not referenced anywhere in the visible part of this file. */
char temp[222];

/*
 * Replace one strided transfer by a series of non-blocking implicit-handle
 * contiguous transfers (upcr_nbi_memget / upcr_nbi_memput, chosen by
 * t->op_type).
 *
 *   t        transfer plan; count[], srcstride[] and dststride[] are read
 *   local    local buffer address for the current sub-block
 *   remote   remote address for the current sub-block
 *   level    stride level being expanded; level 1 issues the transfers,
 *            higher levels recurse with level-1
 *   handles, active   only forwarded by the live code; their use is
 *            confined to the commented-out explicit-handle variant
 *
 * The function does not wait for completion: the caller has to synchronise
 * the implicit-handle operations.
 * NOTE(review): the arguments 1 and 200000000 passed to upcr_add_shared are
 * presumably the element size and a block size -- confirm against the
 * runtime API.
 */
void upcrt_Stride_to_Pipe(upcrt_StrideNTrans *t, char *local, upcr_shared_ptr_t remote, int level, bupc_handle_t *handles, int *active)
{
    UPCR_BEGIN_FUNCTION();
    int msg;
    int i;      /* unused */
    upcr_shared_ptr_t rem;

    if(level == 1) {
        /* innermost level: count[1] chunks of count[0] each */
        for(msg = 0; msg < t->count[1]; msg++) {
            rem = upcr_add_shared(remote, 1, msg*t->srcstride[0], 200000000);
            /*
            if(t->op_type == UPCRT_REF_READ) {
                handles[*active] = bupc_memget_async(local, rem, t->count[0]);
            } else {
                handles[*active] = bupc_memput_async(rem, local, t->count[0]);
            }
            *active = *active+1;
            */
            if(t->op_type == UPCRT_REF_READ) {
                upcr_nbi_memget(local, rem, t->count[0]);
            } else {
                upcr_nbi_memput(rem, local, t->count[0]);
            }
            local += t->dststride[0];
        } /* msg */
    } else {
        for(msg = 0; msg < t->count[level]; msg++) {
            rem = upcr_add_shared(remote, 1, msg*t->srcstride[level-1], 200000000);
upcrt_Stride_to_Pipe(t, local, rem, level-1, handles, active); local += t->dststride[level-1]; } } } void upcrt_Stride_to_Block(upcrt_StrideNTrans *t, char *local, upcr_shared_ptr_t remote, int level, bupc_handle_t *handles, int *active) { UPCR_BEGIN_FUNCTION(); int msg; int i; upcr_shared_ptr_t rem; if(level == 1) { for(msg = 0; msg < t->count[1]; msg++) { rem = upcr_add_shared(remote, 1, msg*t->srcstride[0], 200000000); if(t->op_type == UPCRT_REF_READ) { upcr_memget(local, rem, t->count[0]); } else { upcr_memput(rem, local, t->count[0]); } local += t->dststride[0]; } /* msg */ } else { for(msg = 0; msg < t->count[level]; msg++) { rem = upcr_add_shared(remote, 1, msg*t->srcstride[level-1], 200000000); upcrt_Stride_to_Pipe(t, local, rem, level-1, handles, active); local += t->dststride[level-1]; } } } void upcrt_Init_Comm_StrideNWrite(upcrt_StrideNTrans *t) { UPCR_BEGIN_FUNCTION(); char *local; int i, j; bupc_handle_t handles[1024]; int active = 0; int proto = UPCRT_S2P_PROTO_UNDEF; upcrt_assert((t->type & UPCRT_T_TYPE_MASK) == UPCRT_T_STRIDEN); if(t->try_model) { if(upcrt_Coalesce_is_Slow(t->contig/sizeof(double), t->chunks, bupc_thread_distance(upcr_mythread(), upcr_threadof_shared(t->remote)), &proto) ) { upcrt_assert(proto != UPCRT_S2P_PROTO_UNDEF); local = upcr_shared_to_processlocal(t->local); switch(proto) { case UPCRT_S2P_PROTO_PIPE: upcrt_Stride_to_Pipe(t, local, t->remote, t->stridelevels, handles, &active); upcr_wait_syncnbi_all(); break; case UPCRT_S2P_PROTO_BLOCK: upcrt_Stride_to_Block(t, local, t->remote, t->stridelevels, handles, &active); break; default: upcrt_assert(0); } t->handles[t->active] = BUPC_COMPLETE_HANDLE; goto inc; } } end_anal += (bupc_ticks_now() - start_anal); start_anal = bupc_ticks_now(); /* printf("%d: STRIDE %d %d to %d\n", upcr_mythread(), t->contig/sizeof(double), t->chunks, upcr_threadof_shared(t->remote)); */ t->handles[t->active] = bupc_memput_strided_async(t->remote, ((upcrt_StrideNTrans*)t)->srcstride, 
upcr_shared_to_processlocal(t->local), ((upcrt_StrideNTrans*)t)->dststride, ((upcrt_StrideNTrans*)t)->count, ((upcrt_StrideNTrans*)t)->stridelevels); inc: t->active++; upcrt_assert(t->active < UPCRT_QUEUE_DEPTH); } void upcrt_Init_Comm_StrideNRead(upcrt_StrideNTrans *t) { UPCR_BEGIN_FUNCTION(); char *local; int i, j; bupc_handle_t handles[1024]; int active = 0; int proto = UPCRT_S2P_PROTO_UNDEF; upcrt_assert((t->type & UPCRT_T_TYPE_MASK) == UPCRT_T_STRIDEN); if(t->try_model) { if(upcrt_Coalesce_is_Slow(t->contig/sizeof(double), t->chunks, bupc_thread_distance(upcr_mythread(), upcr_threadof_shared(t->remote)), &proto) ) { upcrt_assert(proto != UPCRT_S2P_PROTO_UNDEF); /* pipe the communication */ local = upcr_shared_to_processlocal(t->local); switch(proto) { case UPCRT_S2P_PROTO_PIPE: upcrt_Stride_to_Pipe(t, local, t->remote, t->stridelevels, handles, &active); upcr_wait_syncnbi_all(); break; case UPCRT_S2P_PROTO_BLOCK: upcrt_Stride_to_Block(t, local, t->remote, t->stridelevels, handles, &active); break; default: upcrt_assert(0); } t->handles[t->active] = BUPC_COMPLETE_HANDLE; goto inc; } } end_anal += (bupc_ticks_now() - start_anal); start_anal = bupc_ticks_now(); t->handles[t->active] = bupc_memget_strided_async(upcr_shared_to_processlocal(t->local), ((upcrt_StrideNTrans*)t)->dststride, t->remote, ((upcrt_StrideNTrans*)t)->srcstride, ((upcrt_StrideNTrans*)t)->count, ((upcrt_StrideNTrans*)t)->stridelevels); inc: t->active++; upcrt_assert(t->active < UPCRT_QUEUE_DEPTH); } void upcrt_Init_Comm_StrideN(upcrt_StrideNTrans *t) { UPCR_BEGIN_FUNCTION(); upcrt_assert((t->type & UPCRT_T_TYPE_MASK) == UPCRT_T_STRIDEN); if(UPCRT_REF_READ == t->op_type ) { t->handles[t->active] = bupc_memget_strided_async(upcr_shared_to_processlocal(t->local), ((upcrt_StrideNTrans*)t)->dststride, t->remote, ((upcrt_StrideNTrans*)t)->srcstride, ((upcrt_StrideNTrans*)t)->count, ((upcrt_StrideNTrans*)t)->stridelevels); } else if(UPCRT_REF_WRITE == t->op_type) { if(t->type & UPCRT_T_PERSISTENT_MEM) 
return; t->handles[t->active] = bupc_memput_strided_async(t->remote, ((upcrt_StrideNTrans*)t)->srcstride, upcr_shared_to_processlocal(t->local), ((upcrt_StrideNTrans*)t)->dststride, ((upcrt_StrideNTrans*)t)->count, ((upcrt_StrideNTrans*)t)->stridelevels); } else upcrt_err("Hybrid transfer request \n"); t->active++; upcrt_assert(t->active < UPCRT_QUEUE_DEPTH); } void _upcrt_finalize_dim(upcr_nest_descr_t ln, int dim) { UPCR_BEGIN_FUNCTION(); int i; int nelem; upcrt_TransDesc *t; upcrt_LoopNestVec *program = *(upcrt_LoopNestVec **)UPCR_TLD_ADDR(upcrt_program); upcrt_LoopNest **cur_nest = (upcrt_LoopNest**)UPCR_TLD_ADDR(upcrt_cur_nest); upcrt_assert((*cur_nest) == &program->elem[ln]); upcrt_assert(dim < UPCRT_MAX_DIM); nelem = Elems((upcrt_Vec*)&(*cur_nest)->comm_write[dim]); for(i=0; i < nelem; i++) { t = (*cur_nest)->comm_write[dim].elem[i]; switch(t->type & UPCRT_T_TYPE_MASK) { case UPCRT_T_CONTIG: Init_Comm_Contig((upcrt_ContigTrans*)t); break; case UPCRT_T_STRIDEN: upcrt_Init_Comm_StrideN((upcrt_StrideNTrans*)t); break; case UPCRT_T_FSTRIDE: case UPCRT_T_ILIST: default: upcrt_err("Transfer type %d (read) not implemented yet\n", t->type & UPCRT_T_TYPE_MASK); break; } } for(i=0; i < nelem; i++) { t = (*cur_nest)->comm_write[dim].elem[i]; switch(t->type& UPCRT_T_TYPE_MASK) { case UPCRT_T_CONTIG: case UPCRT_T_STRIDEN: Advance_Transfer((upcrt_TransDesc*)t); break; case UPCRT_T_FSTRIDE: case UPCRT_T_ILIST: default: upcrt_err("Transfer type %d (write) not implemented yet\n", t->type & UPCRT_T_TYPE_MASK); break; } } } /* Contig test*/ /* First sort in increasing order of the base address. 
Start scanning left to right and test each pair */ static void Collapse_Lmads(upcrt_LmadVec *lmads, upcrt_RefDesc *ref) { upcrt_LmadVec anal; upcrt_Lmad *upcrt_cur_lmad,*left,*right,*scratch; upcrt_Lmad **targ; int i,cwp,coal; memcpy(&anal, lmads, sizeof(upcrt_LmadVec)); qsort(anal.elem, Elems((upcrt_Vec*)lmads), sizeof(upcrt_Lmad*), &Comp_Base_Addr); targ = (upcrt_Lmad**)lmads->elem; left = anal.elem[0]; scratch = (upcrt_Lmad*)calloc(1,sizeof(upcrt_Lmad)); i = 1; cwp = 0; coal = 0; while(i < Elems((upcrt_Vec*)&anal)) { right = anal.elem[i]; if((coal = Test_Lmad_Contig(left,right,scratch,ref))) { /*memset(left, 0, sizeof(upcrt_Lmad));*/ upcrt_cur_lmad = left; left = scratch; scratch = upcrt_cur_lmad; } else { *targ = left; targ++; left = right; cwp++; } i++; } *targ = left; lmads->cnt = cwp+1; } UPCRV_INLINE(Update_Lmad_Span) void Update_Lmad_Span(upcrt_Lmad * lmad) { int i; upcrt_DimVec *dv = &lmad->dvec; lmad->total_span = 1; for(i=0; i < Elems((upcrt_Vec*)dv); i++) { lmad->total_span *= dv->elem[i].span/dv->elem[i].stride; } } /* handle M-d to N-d redistributions */ /* see SP/y_solve.c, class A, 4 procs (1,1023)(2178,10890) -> (1,33)(1089, 33759)(35937,179685) */ static int Test_NStrideMxN(upcrt_DimVec * dvc, upcrt_DimVec * redist, upcrt_StrideNTrans * c_plan, size_t esize) { int i; int sl, start, prev_stride, dl; int reshape = 0; int swap = 0; upcrt_DimVec *tvec = 0; char buf[sizeof(size_t)* UPCRT_MAX_DIM]; upcrt_DimVec *L, *R, *D; size_t Lst[UPCRT_MAX_DIM], Rst[UPCRT_MAX_DIM]; #define TRIP(X) (X)->span/(X)->stride #define ErrorMxN() printf("Generic N-dim to M-dim (%d->%d) redistributions not implemented\n", \ Elems((upcrt_Vec*)redist), Elems((upcrt_Vec*)dvc)); \ for(i = 0; i < Elems((upcrt_Vec*)dvc); i++) {\ printf("(%d,%d)\t", dvc->elem[i].stride, redist->elem[i].stride); \ } \ printf("\n"); \ return 0; upcrt_assert(redist); /* The assumption is that in the vector with fewer dimension there is one dimension that covers completely one of the dimensions in the 
other vector, e.g. if (stride1, span1) appears in the short vector, (stride1, span2) needs to be in the longer vector, where span2 divides span1 */ /* inner stride is 1 in both vectors*/ /* only first dimension needs to be split */ if(Elems((upcrt_Vec*)dvc) < Elems((upcrt_Vec*)redist)) { swap = 1; L = dvc; dvc = redist; redist = L; } if((dvc->elem[0].stride != 1 && redist->elem[0].stride != 1) || redist->elem[0].span % dvc->elem[0].span) { ErrorMxN(); } c_plan->count[0] = dvc->elem[0].span *esize; c_plan->srcstride[0] = dvc->elem[1].stride * esize; c_plan->dststride[0] = c_plan->count[0]; sl = 1; prev_stride = 1; dl = 1; for(i=1; i < Elems((upcrt_Vec*)dvc) - 1; i++, sl++) { if(dvc->elem[i].stride % prev_stride) return 0; prev_stride = dvc->elem[i].stride; c_plan->count[sl] = dvc->elem[i].span/prev_stride; c_plan->srcstride[sl] = dvc->elem[i+1].stride * esize; if(i==1) c_plan->dststride[sl] = redist->elem[1].stride*esize; else c_plan->dststride[sl] = c_plan->dststride[sl-1] * c_plan->count[sl]; } c_plan->count[i] = dvc->elem[i].span/dvc->elem[i].stride; c_plan->stridelevels = i+1; c_plan->stridelevels = i; /* if(c_plan->stridelevels == 3 && upcrt_print_targets) { */ /* Print_DimVec(dvc, "&&&"); */ /* Print_DimVec(redist, "+++"); */ /* } */ if(swap) { memcpy(Lst, c_plan->dststride, sizeof(size_t)*(c_plan->stridelevels+1)); memcpy(c_plan->dststride, c_plan->srcstride, sizeof(size_t)*(c_plan->stridelevels+1)); memcpy(c_plan->srcstride, Lst, sizeof(size_t)*(c_plan->stridelevels+1)); } return 1; } /* This handles arbitrary N-d to N-d redistributions. At this point the code tries to figure out how to move the data using one VIS call. However, this may not be optimal. See z_unpack_backsub_info in NAS-BT which does the following transfer ((1,5)(40,2240)) -> ((1,35), (40,280)). In this particular case, I have a strong feeling might be better to generate VIS calls that move the 35 elements. (Operate on contiguous regions when they are large enough.) 
*/ static int Test_NStrideNxN(upcrt_DimVec * dvc, upcrt_DimVec * redist, upcrt_StrideNTrans * c_plan, size_t esize) { int i; int sl, start, prev_stride; int reshape = 0; int swap = 0; upcrt_DimVec *tvec = 0; char buf[sizeof(size_t)* UPCRT_MAX_DIM]; upcrt_DimVec *L, *R, *D; size_t Lst[UPCRT_MAX_DIM], Rst[UPCRT_MAX_DIM]; #define TRIP(X) (X)->span/(X)->stride #define ErrorNxN() printf("Generic N-dim to N-dim (%d->%d) redistributions not implemented\n", \ Elems((upcrt_Vec*)redist), Elems((upcrt_Vec*)dvc)); \ for(i = 0; i < Elems((upcrt_Vec*)dvc); i++) {\ printf("(%d,%d)\t", dvc->elem[i].stride, redist->elem[i].stride); \ } \ printf("\n"); \ return 0; if(dvc->elem[0].stride == redist->elem[0].stride && dvc->elem[0].span == redist->elem[0].span) { /* the code assumes that the first lmad is the one that gets split */ return 0; } c_plan->try_model = 0; c_plan->count[0] = esize; if(TRIP(&dvc->elem[0]) > TRIP(&redist->elem[0])) { swap = 1; L = redist; R = dvc; } else { L = dvc; R= redist; } if(L->elem[0].stride == 1 && R->elem[0].stride == 1) { if(R->elem[0].span % L->elem[0].span == 0) { c_plan->try_model = UPCRT_MODEL_PIPE; c_plan->count[0] = L->elem[0].span * esize; Lst[0] = L->elem[1].stride*esize; Rst[0] = c_plan->count[0]; c_plan->count[1] = R->elem[0].span/L->elem[0].span; Lst[1] = c_plan->count[1] * L->elem[1].stride*esize + L->elem[1].stride*esize; Rst[1] = R->elem[1].stride * esize; c_plan->count[2] = R->elem[1].span / R->elem[1].stride; Lst[2] = 0; Rst[2] = 0; /* This code handles the case (1,17)(289,24565) -> (1,272)(578,2890) which assumes that collapsed dimensions are adjacent and occur in positions 0, 1. 
Everything else afterwards has to match, otherwise we don't handle */ for(i = 2; i < dvc->cnt; i++) { ErrorNxN(); if(TRIP(&L->elem[i]) != TRIP(&R->elem[i])) { ErrorNxN(); } else { ErrorNxN(); /* untested, not encountered in practice yet */ Lst[i] = L->elem[i].stride * esize; Rst[i] = R->elem[i].stride * esize; c_plan->count[i+1] = TRIP(&L->elem[i]); } } } else { ErrorNxN(); } c_plan->stridelevels = 2; } else { c_plan->try_model = UPCRT_MODEL_PIPE; Lst[0] = L->elem[0].stride*esize; Rst[0] = R->elem[0].stride*esize; c_plan->count[1] = L->elem[0].span/L->elem[0].stride; Lst[1] = L->elem[1].stride*esize; Rst[1] = R->elem[0].stride * c_plan->count[1]*esize; c_plan->count[2] = TRIP(&(R->elem[0])) / TRIP(&(L->elem[0])); Lst[2] = 0; Rst[2] = 0; for(i = 1; i <= dvc->cnt; i++) { Lst[i+1] = c_plan->count[i+1]*Lst[i]; Rst[i+1] = c_plan->count[i+1]*Rst[i]; c_plan->count[i+2] = (TRIP(&L->elem[i]) > TRIP(&R->elem[i])) ? TRIP(&L->elem[i])/c_plan->count[i+1] : TRIP(&R->elem[i])/c_plan->count[i+1]; } c_plan->stridelevels = dvc->cnt+1; } if(c_plan->try_model) { c_plan->contig = c_plan->count[0]; c_plan->chunks = c_plan->count[1]; for(i=2; i <= c_plan->stridelevels; i++) c_plan->chunks = c_plan->chunks*c_plan->count[i]; } if(swap) { memcpy(c_plan->dststride, Lst, sizeof(size_t)*(c_plan->stridelevels+1)); memcpy(c_plan->srcstride, Rst, sizeof(size_t)*(c_plan->stridelevels+1)); } else { memcpy(c_plan->dststride, Rst, sizeof(size_t)*(c_plan->stridelevels+1)); memcpy(c_plan->srcstride, Lst, sizeof(size_t)*(c_plan->stridelevels+1)); } return 1; } /* This code can handle the following: 1. contiguous to N-dim data movement 2. N-d to N-d data movement (where the descriptors are the same) See Test_NStrideNxN for N-d to N-d reshaping. It does not handle arbitrary M-d to N-d reshaping. 
*/ static int Test_NStride(upcrt_DimVec * dvc, upcrt_DimVec * redist, upcrt_StrideNTrans * c_plan, size_t esize) { int i; int sl, start, prev_stride; int reshape = 0; int scatter = 0; upcrt_DimVec *tmp = 0; char buf[sizeof(size_t)* UPCRT_MAX_DIM]; int n2n = 0; c_plan->try_model = 0; upcrt_assert((c_plan->type & UPCRT_T_TYPE_MASK) == UPCRT_T_STRIDEN); if(redist) { reshape = memcmp(&dvc->elem[0], &redist->elem[0], Elems((upcrt_Vec*)dvc)*sizeof(upcrt_Dim)); if(reshape == 0) { upcrt_assert(Elems((upcrt_Vec*)dvc) == Elems((upcrt_Vec*)redist)); reshape = 1; } else { if( (Elems((upcrt_Vec*)dvc) == 1) && (Elems((upcrt_Vec*)dvc) < Elems((upcrt_Vec*)redist))) { scatter = 1; tmp = dvc; dvc = redist; redist = tmp; /* swap the two dimvecs so the code that computes the stride works */ reshape = 0; } else { if(Elems((upcrt_Vec*)redist) != 1) { if(Elems((upcrt_Vec*)dvc) != Elems((upcrt_Vec*)redist)) { return Test_NStrideMxN(dvc, redist, c_plan, esize); } else { /* This is N-d to N-d redist, need to make sure that things divide evenly */ /* That would be span1[i]/stride1[i] == span2[i]/stride2[i] stride1[i] is a multiple stride2[i] In order to implement, pick ref1 as the reference with the smaller stride[0] Anyway, this situation is caused by loops, so let's live dangerously and leave the test unimplemented for the time being. */ if(dvc->elem[0].init_pos != redist->elem[0].init_pos) { /* This happens when lmads were coalesced. Not sure whether to check for all positions or not. Not even very sure how to detect this efficiently. Might be the case that we need to revert back to the original descriptors. Paranoia is good: should write a real check here to protect against buggy source level loops. 
*/ for(i=1; i< dvc->cnt; i++) { upcrt_assert(dvc->elem[i].init_pos == redist->elem[i].init_pos); } return Test_NStrideNxN(dvc, redist, c_plan, esize); } else n2n = 1; } } } } } /* redist */ if(Elems((upcrt_Vec*) dvc) == 1 && !scatter) { upcrt_assert(dvc->elem[0].stride > 1); c_plan->count[0] = esize; c_plan->count[1] = dvc->elem[0].span/dvc->elem[0].stride; c_plan->srcstride[0] = dvc->elem[0].stride*esize; if(reshape) c_plan->dststride[0] = c_plan->srcstride[0]; else c_plan->dststride[0] = esize; c_plan->stridelevels = 1; } else { if(dvc->elem[0].stride != 1) { c_plan->count[0] = esize; c_plan->srcstride[0] = dvc->elem[0].stride * esize; if(reshape) c_plan->dststride[0] = c_plan->srcstride[0]; else c_plan->dststride[0] = esize; sl = 1; prev_stride = 1; for(i=sl; i <= Elems((upcrt_Vec*)dvc) - 1; i++) { if(dvc->elem[i].stride % prev_stride) return 0; prev_stride = dvc->elem[i].stride; c_plan->count[i] = dvc->elem[i-1].span/dvc->elem[i-1].stride; c_plan->srcstride[i] = dvc->elem[i+1].stride * esize; if(reshape) c_plan->dststride[i] = c_plan->srcstride[i]; else c_plan->dststride[i] = c_plan->dststride[i-1] * c_plan->count[i]; } c_plan->count[i] = dvc->elem[i-1].span/dvc->elem[i-1].stride; c_plan->stridelevels = i; c_plan->try_model = UPCRT_MODEL_PIPE; /* should be 0? 
*/ } else { /* inner stride is 1 */ c_plan->count[0] = dvc->elem[0].span *esize; c_plan->srcstride[0] = dvc->elem[1].stride * esize; if(reshape) c_plan->dststride[0] = c_plan->srcstride[0]; else c_plan->dststride[0] = c_plan->count[0]; sl = 1; prev_stride = 1; for(i=1; i < Elems((upcrt_Vec*)dvc) - 1; i++, sl++) { if(dvc->elem[i].stride % prev_stride) return 0; prev_stride = dvc->elem[i].stride; c_plan->count[sl] = dvc->elem[i].span/prev_stride; c_plan->srcstride[sl] = dvc->elem[i+1].stride * esize; if(reshape) c_plan->dststride[sl] = c_plan->srcstride[sl]; else c_plan->dststride[sl] = c_plan->dststride[sl-1] * c_plan->count[sl]; } c_plan->count[i] = dvc->elem[i].span/dvc->elem[i].stride; c_plan->stridelevels = i; c_plan->try_model = UPCRT_MODEL_PIPE; } } if(n2n) { /* recompute dststride based on the information in redist */ if(redist->elem[0].stride != 1) { c_plan->dststride[0] = redist->elem[0].stride * esize; sl = 1; prev_stride = 1; for(i=sl; i <= Elems((upcrt_Vec*)redist) - 1; i++) { if(redist->elem[i].stride % prev_stride) return 0; prev_stride = redist->elem[i].stride; c_plan->dststride[i] = redist->elem[i+1].stride * esize; } //c_plan->try_model = 0; } else { /* inner stride is 1 */ c_plan->dststride[0] = redist->elem[1].stride * esize; sl = 1; prev_stride = 1; for(i=1; i < Elems((upcrt_Vec*)redist) - 1; i++, sl++) { if(redist->elem[i].stride % prev_stride) return 0; prev_stride = redist->elem[i].stride; c_plan->dststride[sl] = redist->elem[i+1].stride * esize; } } } if(c_plan->try_model) { c_plan->contig = c_plan->count[0]; c_plan->chunks = c_plan->count[1]; for(i=2; i <= c_plan->stridelevels; i++) c_plan->chunks = c_plan->chunks*c_plan->count[i]; } if (scatter) { upcrt_assert(n2n == 0); /* need to swap the dststride with srcstride */ memcpy(buf, c_plan->dststride, sizeof(size_t)*(c_plan->stridelevels)); memcpy(c_plan->dststride, c_plan->srcstride, sizeof(size_t)*(c_plan->stridelevels)); memcpy(c_plan->srcstride, buf, sizeof(size_t)*(c_plan->stridelevels)); 
} return 1; } static void Simplify_Ref(upcrt_RefDesc *ref) { UPCR_BEGIN_FUNCTION(); upcrt_LmadVec *lmads; int l; lmads = &ref->desc; for(l=0; l < Elems((upcrt_Vec*)lmads); l++) { Coalesce_Lmad_Dim(lmads->elem[l], &ref->comm); } lmads = &ref->comm; if(Elems((upcrt_Vec*)lmads) > 1) { Collapse_Lmads(lmads, ref); } } #define UPCRT_SIZE_IDX 0 #define UPCRT_B_IDX 1 UPCRV_INLINE(Apply_Model_1RS1) void Apply_Model_1RS1(int *strips, int *burst, int size, int Nref, int Nop, int local) { upcrt_pair *fast_eval; int index; if(local) { *strips = 1; *burst = 1; return; } else { if(Nref < UPCRT_NREF_COMM && Nop < UPCRT_NOP_COMM) { if(size < fast_eval_smallP[0][0]) { *strips = 1; *burst = 1; return; } if(upcr_threads() <= UPCRT_BW_PTHRESH && !upcrt_all2all) fast_eval = (upcrt_pair*)fast_eval_smallP; else fast_eval = (upcrt_pair*)fast_eval_largeP; index = Log2(size) - UPCRT_FIRST_LOG_DEC; if(index >= sizeof(fast_eval_smallP)/(2*sizeof(int))) { printf("Large volume not implemented yet\n"); upcr_global_exit(0); } else { upcrt_assert(size - fast_eval[index][UPCRT_SIZE_IDX] >= 0 && fast_eval[index][UPCRT_B_IDX] > 0); upcrt_assert(fast_eval[index+1][UPCRT_SIZE_IDX] - size >= 0 && fast_eval[index+1][UPCRT_B_IDX] > 0); upcrt_assert(fast_eval[index+1][UPCRT_SIZE_IDX] - fast_eval[index][UPCRT_SIZE_IDX] > 0); *strips = (int)(((double)size - (double)fast_eval[index][UPCRT_SIZE_IDX])* (double)fast_eval[index][UPCRT_B_IDX] + ((double)fast_eval[index+1][UPCRT_SIZE_IDX] - (double)size)* (double)fast_eval[index+1][UPCRT_B_IDX]) / ((double)fast_eval[index+1][UPCRT_SIZE_IDX] - (double)fast_eval[index][UPCRT_SIZE_IDX]); *burst = (*strips < UPCRT_MAX_BURST_LEN) ? 
*strips : UPCRT_MAX_BURST_LEN; if(upcrt_all2all /* || upcr_threads() > 32 */) { *burst = 3; } return; } /* large volume */ } else { printf("heavy comp not implemented yet \n"); upcr_global_exit(0); } } /* remote */ } void upcrt_Advance_Transfer_Contig(upcrt_ContigTrans *t) { UPCR_BEGIN_FUNCTION(); int i, stripsize; char *local; upcr_shared_ptr_t remote; if(upcr_hasMyAffinity_shared(t->remote)) return; if(t->nstrips == 1) { for(i = 0; i < t->active; i++) { bupc_waitsync(t->handles[i]); end_comm += (bupc_ticks_now() - start_comm); } t->active = 0; return; } if(t->init_comm - t->wait_comm == t->burst && t->init_comm < t->nstrips) { if(t->skip_init == t->stage_init + t->skip_sync || t->skip_init == t->stage_init + 1) { t->stage_init = 0; /* init next B ops */ local = (char*)upcr_shared_to_processlocal(t->local); stripsize = t->size/t->nstrips; local = local + stripsize*t->init_comm; remote = upcr_add_shared(t->remote, 1, stripsize*t->init_comm, t->size+1); for(i = 0; i < t->burst-1; i++) { if(t->init_comm == t->nstrips - 1) { break; } t->handles[t->init_comm % UPCRT_QUEUE_DEPTH] = bupc_memget_async(local, remote, stripsize); local = local+stripsize; remote = upcr_add_shared(remote, 1, stripsize, t->size+1); t->init_comm++; } if(t->init_comm == t->nstrips-1) stripsize = t->size - stripsize*(t->nstrips-1); t->handles[t->init_comm % UPCRT_QUEUE_DEPTH] = bupc_memget_async(local, remote, stripsize); t->init_comm++; } else { t->stage_init++; } } else { /* init next b ops */ t->stage_init++; } if(t->wait_comm < t->nstrips) { if(t->skip_sync - 1 == t->stage_sync) { bupc_waitsync(t->handles[t->wait_comm % UPCRT_QUEUE_DEPTH]); t->wait_comm++; t->stage_sync = 0; } else { t->stage_sync++; } } } void upcrt_Init_Read_Contig(upcrt_ContigTrans *t) { UPCR_BEGIN_FUNCTION(); int stripsize, i; char *local; upcr_shared_ptr_t remote; if(t->nstrips == 1) { t->handles[t->active] = bupc_memget_async(upcr_shared_to_local(t->local), t->remote, t->size); t->active++; return; } local = 
(char*)upcr_shared_to_processlocal(t->local); stripsize = t->size/t->nstrips; t->init_comm = 0; t->wait_comm = 0; for(i = 0; i < t->burst - 1; i++) { remote = upcr_add_shared(t->remote, 1, i*stripsize, t->size+1); t->handles[t->init_comm % UPCRT_QUEUE_DEPTH] = bupc_memget_async(local, remote, stripsize); local = local+stripsize; t->init_comm++; } remote = upcr_add_shared(remote, 1, stripsize, t->size+1); if(t->burst == t->nstrips) stripsize = t->size - stripsize*(t->nstrips-1); t->handles[t->init_comm % UPCRT_QUEUE_DEPTH] = bupc_memget_async(local, remote, stripsize); t->init_comm++; t->stage_init = t->skip_init-t->skip_sync; t->stage_sync = t->skip_sync-1; } int Determine_Sync_Level(upcrt_ContigTrans *t, upcrt_DimVec *desc, int esize) { int i, size, depth, skip, next_size, TSIZE, level; int done ; done = 0; if(t->nstrips == 1) { /* transfer not decomposed, should be finished at the top of the nest*/ t->skip_init =0; t->skip_sync = 0; return 0; } /* stride 1 is in pos 0 determine the level that covers the stripsize */ depth = desc->cnt-1; size = desc->elem[depth].span*esize; depth--; TSIZE = ((t->size/t->nstrips)*t->burst)/esize; while(!done) { next_size =desc->elem[depth].span; if( next_size > TSIZE) { if(depth == 0) { break; } if(desc->elem[depth].span/desc->elem[depth].stride > t->burst) { break; } depth--; } depth--; } level = desc->elem[depth+1].init_pos; if(desc->elem[level].span > TSIZE) { t->nstrips = t->size/(desc->elem[level].span*esize); /* Ideally I should reapply the model here, instead just make something up */ t->burst = 3; TSIZE = ((t->size/t->nstrips)*t->burst)/esize; } t->skip_init = TSIZE/desc->elem[level].span; t->stage_init = 0; t->skip_sync = ((t->size/t->nstrips)/esize)/desc->elem[level].span; t->stage_sync = 0; return level; } /*This is where most of the work is done.*/ /*The stages are:*/ /* 1. Try to collapse the dimensions of all the lmads in all the refs*/ /* 2. 
Sort the lmads for each ref in the increasing order of the remote address*/ /* and apply the contiguous test.*/ /* 3. Hopefully by now we get only a very reduced set of summary lmads. If there's*/ /* potential for intersection between the read/write sets, need to test for */ /* conflicts. Abort analysis and vect unless WL // RL. */ /* For the beginning I have a strong feeling that we abort unless they have only one dimension*/ /* 4. By now we know what exactly needs communication so we need to generate the communication*/ /* descriptors. */ /*Optimizations:*/ /* i. What to do for short loops?*/ /* ii. MOST IMPORTANT - upcrt_program phases get reexecuted, and each phase has*/ /* a static structure, i.e. even if the values change, the overall structure*/ /* (at least dims) stays the same. This means we can compute a plan for a*/ /* given loop structure and just perform the transformations in the plan instead*/ /* of blindly searching for matches. */ /* iii. Is there any way to overlap more of the analysis with the communication? 
*/ int _upcrt_analyze_transfers( upcr_nest_descr_t ln) { UPCR_BEGIN_FUNCTION(); int result; upcrt_RefVec *refs; upcrt_LmadVec *lmads; upcrt_TransDesc *c_plan; int l,r, level; upcrt_LmadVec anal; upcrt_Lmad *cl, *desc; upcrt_DimVec *dvc, *dv_redist = NULL; upcrt_RefDesc *cr; int strips, burst, Nref, Nop; int local; upcrt_LoopNestVec *program = *(upcrt_LoopNestVec **)UPCR_TLD_ADDR(upcrt_program); upcrt_LoopNest **cur_nest = (upcrt_LoopNest**)UPCR_TLD_ADDR(upcrt_cur_nest); end_desc += (bupc_ticks_now() - start_desc); start_anal = bupc_ticks_now(); upcrt_assert((*cur_nest) == &program->elem[ln]); result = 1; Nref = (*cur_nest)->Nref; Nop = (*cur_nest)->Nop; memset(&anal, 0, sizeof(upcrt_LmadVec)); /* if(program->elem[ln].isredist) */ /* return _upcrt_analyze_redist(ln); */ refs = &(*cur_nest)->refs; for(r = 0; r < Elems((upcrt_Vec*)refs); r++) { cr = &refs->elem[r]; if(cr->peer_ref == UPCRT_DEFAULT_PEER_REF) { Simplify_Ref(cr); lmads = &cr->comm; /* how do I test for success? */ /* now generate the communication */ for(l=0; l < Elems((upcrt_Vec*)lmads); l++) { cl = lmads->elem[l]; /*result += Elems((upcrt_Vec*)&cl->equiv_read) + Elems((upcrt_Vec*)&cl->equiv_write);*/ Update_Lmad_Span(cl); dvc = &cl->dvec; c_plan = (upcrt_TransDesc*)calloc(1,sizeof(upcrt_StrideNTrans)); c_plan->remote = cl->remote; c_plan->size = cl->total_span*cr->esize; c_plan->esize = cr->esize; if(cl->redist_ref == UPCRT_INVALID_REF) { /* TO DO: this leaks, needs to be freed at some point */ c_plan->local = upcr_local_alloc(cl->total_span,cr->esize); Set_TransDesc_Lifetime(c_plan, UPCRT_T_TRANSIENT_MEM); dv_redist = NULL; } else { /* the local address is in redist_ref->lmad[0].comm->remote */ Simplify_Ref(&refs->elem[cl->redist_ref]); c_plan->local = refs->elem[cl->redist_ref].comm.elem[0]->remote; dv_redist= &refs->elem[cl->redist_ref].comm.elem[l]->dvec; Set_TransDesc_Lifetime(c_plan, UPCRT_T_PERSISTENT_MEM); } if (Elems((upcrt_Vec*)dvc) == 1 && dvc->elem[0].stride == 1) { if(cl->redist_ref == 
UPCRT_INVALID_REF || (Elems((upcrt_Vec*)dv_redist) == 1 && dv_redist->elem[0].stride == 1)) { /* collapsed transfer */ switch(cl->type) { case UPCRT_REF_READ: start_comm = bupc_ticks_now(); local = upcr_hasMyAffinity_shared(c_plan->remote); level = 0; if (local) { upcr_memget(upcr_shared_to_processlocal(c_plan->local), c_plan->remote, c_plan->size); } else { c_plan->op_type = UPCRT_REF_READ; Apply_Model_1RS1(&((upcrt_ContigTrans*)c_plan)->nstrips, &((upcrt_ContigTrans*)c_plan)->burst, c_plan->size/sizeof(double), Nref, Nop, 0); /* if(upcr_mythread() == 0) { */ /* Print_LoopNest((*cur_nest)); */ /* } */ level = Determine_Sync_Level((upcrt_ContigTrans*)c_plan, &cr->desc.elem[l]->dvec, cr->esize); upcrt_Init_Read_Contig((upcrt_ContigTrans*)c_plan); } Add_Element((upcrt_Vec*)&(*cur_nest)->comm_read[level], &c_plan, sizeof(upcrt_TransDesc*)); break; case UPCRT_REF_WRITE: c_plan->op_type = UPCRT_REF_WRITE; Add_Element((upcrt_Vec*)&(*cur_nest)->comm_write[0], &c_plan, sizeof(upcrt_TransDesc*)); if(c_plan->type & UPCRT_T_PERSISTENT_MEM) Init_Comm_ContigWrite((upcrt_ContigTrans *)c_plan); break; case UPCRT_REF_RDWR: upcrt_err("Rd/WR not implemented yet\n"); break; default: upcrt_err("Unexpected lmad type \n"); } cl->local = c_plan->local; Set_TransDesc_Type(c_plan, UPCRT_T_CONTIG); Update_Peer_Local_Address(cl, cr, cl->local); upcrt_assert(c_plan->active < UPCRT_QUEUE_DEPTH); } else { /* redistribution with the target contiguous and the src strided */ goto REDIST; } } else { /* strided transfer */ REDIST: Set_TransDesc_Type(c_plan, UPCRT_T_STRIDEN); if(Test_NStride(dvc, dv_redist, (upcrt_StrideNTrans*)c_plan, cr->esize)) { switch(cl->type) { case UPCRT_REF_READ: c_plan->op_type = UPCRT_REF_READ; upcrt_Init_Comm_StrideNRead((upcrt_StrideNTrans *)c_plan); Add_Element((upcrt_Vec*)&(*cur_nest)->comm_read[0], &c_plan, sizeof(upcrt_TransDesc*)); break; case UPCRT_REF_WRITE: c_plan->op_type = UPCRT_REF_WRITE; Add_Element((upcrt_Vec*)&(*cur_nest)->comm_write[0], &c_plan, 
sizeof(upcrt_TransDesc*)); if(c_plan->type & UPCRT_T_PERSISTENT_MEM) { upcrt_Init_Comm_StrideNWrite((upcrt_StrideNTrans *)c_plan); } else { end_anal += (bupc_ticks_now() - start_anal); start_anal = bupc_ticks_now(); } break; case UPCRT_REF_RDWR: upcrt_err("Rd/WR not implemented yet\n"); break; default: upcrt_err("Unexpected lmad type \n"); } cl->local = c_plan->local; Set_TransDesc_Type(c_plan, UPCRT_T_STRIDEN); Update_Peer_Local_Address(cl, cr, cl->local); upcrt_assert(c_plan->active < UPCRT_QUEUE_DEPTH); } else { Print_LoopNest((*cur_nest)); upcrt_err("Multiple strides not implemented yet\n"); } } } } /* if DEFAULT_PEER */ } /* forall refs */ /* if(upcrt_print_targets && upcr_mythread() == UPCRT_DEBUG_THREAD) { */ /* Print_LoopNest((*cur_nest)); */ /* } */ return result; } void Analyze_and_Init_Redist(upcr_nest_descr_t ln) { UPCR_BEGIN_FUNCTION(); int i,j; int nelem; int active[UPCRT_MAX_DIM]; bupc_handle_t handles[UPCRT_MAX_DIM]; bupc_pmemvec_t Llist[UPCRT_MAX_DIM]; bupc_smemvec_t Rlist[UPCRT_MAX_DIM]; int Mcount = 0; int Hcount = 0; int last_insert = -1; int proto = UPCRT_S2P_PROTO_UNDEF; upcrt_StrideNTrans agg; upcrt_TransDesc *l, *r; upcrt_CommVec *cvec; upcrt_LoopNest **cur_nest = (upcrt_LoopNest**)UPCR_TLD_ADDR(upcrt_cur_nest); cvec = &(*cur_nest)->comm_read[0]; nelem = Elems((upcrt_Vec*)cvec); for(i=0; i < nelem; i++) active[i] = 1; upcrt_assert(nelem < UPCRT_MAX_DIM); for(i=0; i < nelem; i++) { l = (upcrt_TransDesc*)cvec->elem[i]; if(!active[i] || ((l->type & UPCRT_T_TYPE_MASK) != UPCRT_T_CONTIG) ) continue; upcrt_assert((l->type & UPCRT_T_TYPE_MASK) == UPCRT_T_CONTIG); /* do not combine large pieces */ proto = UPCRT_S2P_PROTO_UNDEF; if(upcrt_Coalesce_is_Slow(l->size/sizeof(double), nelem, bupc_thread_distance(upcr_mythread(), upcr_threadof_shared(l->remote)), &proto) ) { upcrt_assert(proto != UPCRT_S2P_PROTO_UNDEF); upcrt_assert(proto == UPCRT_S2P_PROTO_PIPE); handles[Hcount] = bupc_memget_async(upcr_shared_to_local(l->local), l->remote, l->size); 
Hcount++; continue; } for(j = i+1; j < nelem; j++) { r = (upcrt_TransDesc*)cvec->elem[j]; if(!active[j] || ((l->type & UPCRT_T_TYPE_MASK) != UPCRT_T_CONTIG)) continue; upcrt_assert ( (r->type & UPCRT_T_TYPE_MASK) == UPCRT_T_CONTIG); active[j] = 0; /* combine(l,r) */ if(last_insert != i) { /* add i */ Llist[Mcount].addr = upcr_shared_to_local(l->local); Llist[Mcount].len = l->size; Rlist[Mcount].addr = l->remote; Rlist[Mcount].len= l->size; Mcount++; last_insert = i; } /* add j */ Llist[Mcount].addr = upcr_shared_to_local(r->local); Llist[Mcount].len = r->size; Rlist[Mcount].addr = r->remote; Rlist[Mcount].len= r->size; Mcount++; } /* j */ } /* i */ /* init agg */ if(Mcount) { handles[Hcount] = bupc_memget_vlist_async(Mcount, Llist, Mcount, Rlist); Hcount++; } for(i=0; icnt = cvec->cnt - Hcount; cvec = &(*cur_nest)->comm_write[0]; nelem = Elems((upcrt_Vec*)cvec); Mcount = 0; for(i=0; i < nelem; i++) active[i] = 1; upcrt_assert(nelem < UPCRT_MAX_DIM); for(i=0; i < nelem; i++) { l = (upcrt_TransDesc*)cvec->elem[i]; if(!active[i] || ((l->type & UPCRT_T_TYPE_MASK) != UPCRT_T_CONTIG) ) continue; upcrt_assert((l->type & UPCRT_T_TYPE_MASK) == UPCRT_T_CONTIG); proto = UPCRT_S2P_PROTO_UNDEF; /* do not combine large pieces */ if(upcrt_Coalesce_is_Slow(l->size/sizeof(double), nelem, bupc_thread_distance(upcr_mythread(), upcr_threadof_shared(l->remote)), &proto) ) { upcrt_assert(proto != UPCRT_S2P_PROTO_UNDEF); upcrt_assert(proto == UPCRT_S2P_PROTO_PIPE); handles[Hcount] = bupc_memput_async(l->remote, upcr_shared_to_local(l->local), l->size); Hcount++; continue; } for(j = i+1; j < nelem; j++) { r = (upcrt_TransDesc*)cvec->elem[j]; if(!active[j] || ((l->type & UPCRT_T_TYPE_MASK) != UPCRT_T_CONTIG)) continue; upcrt_assert ( (r->type & UPCRT_T_TYPE_MASK) == UPCRT_T_CONTIG); active[j] = 0; /* combine(l,r) */ if(last_insert != i) { /* add i */ Llist[Mcount].addr = upcr_shared_to_local(l->local); Llist[Mcount].len = l->size; Rlist[Mcount].addr = l->remote; Rlist[Mcount].len= l->size; 
Mcount++; last_insert = i; } /* add j */ Llist[Mcount].addr = upcr_shared_to_local(r->local); Llist[Mcount].len = r->size; Rlist[Mcount].addr = r->remote; Rlist[Mcount].len= r->size; Mcount++; } /* j */ } /* i */ /* init agg */ if(Mcount) { handles[Hcount] = bupc_memput_vlist_async(Mcount, Rlist, Mcount, Llist); Hcount++; } for(i=0; icnt = cvec->cnt-Hcount; } /* Analyze_and_Init_Redist */ int _upcrt_analyze_redist( upcr_nest_descr_t ln) { UPCR_BEGIN_FUNCTION(); int result; upcrt_RefVec *refs; upcrt_LmadVec *lmads; upcrt_TransDesc *c_plan; int l,r; upcrt_LmadVec anal; upcrt_Lmad *cl; upcrt_DimVec *dvc, *dv_redist = NULL; upcrt_RefDesc *cr; upcrt_LoopNestVec *program = *(upcrt_LoopNestVec **)UPCR_TLD_ADDR(upcrt_program); upcrt_LoopNest **cur_nest = (upcrt_LoopNest**)UPCR_TLD_ADDR(upcrt_cur_nest); int local; end_desc += (bupc_ticks_now() - start_desc); start_anal = bupc_ticks_now(); upcrt_assert((*cur_nest) == &program->elem[ln]); result = 1; memset(&anal, 0, sizeof(upcrt_LmadVec)); refs = &(*cur_nest)->refs; for(r = 0; r < Elems((upcrt_Vec*)refs); r++) { cr = &refs->elem[r]; if(cr->peer_ref == UPCRT_DEFAULT_PEER_REF) { Simplify_Ref(cr); lmads = &cr->comm; /* how do I test for success? 
*/ /* now generate the communication */ for(l=0; l < Elems((upcrt_Vec*)lmads); l++) { cl = lmads->elem[l]; Update_Lmad_Span(cl); dvc = &cl->dvec; c_plan = (upcrt_TransDesc*)calloc(1,sizeof(upcrt_StrideNTrans)); c_plan->remote = cl->remote; c_plan->size = cl->total_span*cr->esize; c_plan->esize = cr->esize; if(cl->redist_ref == UPCRT_INVALID_REF) { /* TO DO: this leaks, needs to be freed at some point */ c_plan->local = upcr_local_alloc(cl->total_span,cr->esize); Set_TransDesc_Lifetime(c_plan, UPCRT_T_TRANSIENT_MEM); dv_redist = NULL; } else { /* the local address is in redist_ref->lmad[0].comm->remote */ Simplify_Ref(&refs->elem[cl->redist_ref]); c_plan->local = refs->elem[cl->redist_ref].comm.elem[0]->remote; dv_redist= &refs->elem[cl->redist_ref].comm.elem[l]->dvec; Set_TransDesc_Lifetime(c_plan, UPCRT_T_PERSISTENT_MEM); } if (Elems((upcrt_Vec*)dvc) == 1 && dvc->elem[0].stride == 1) { if(cl->redist_ref == UPCRT_INVALID_REF || (Elems((upcrt_Vec*)dv_redist) == 1 && dv_redist->elem[0].stride == 1)) { /* collapsed transfer */ switch(cl->type) { case UPCRT_REF_READ: start_comm = bupc_ticks_now(); local = upcr_hasMyAffinity_shared(c_plan->remote); if (local) { upcr_memget(upcr_shared_to_processlocal(c_plan->local), c_plan->remote, c_plan->size); } else { c_plan->op_type = UPCRT_REF_READ; } Add_Element((upcrt_Vec*)&(*cur_nest)->comm_read[0], &c_plan, sizeof(upcrt_TransDesc*)); break; case UPCRT_REF_WRITE: c_plan->op_type = UPCRT_REF_WRITE; Add_Element((upcrt_Vec*)&(*cur_nest)->comm_write[0], &c_plan, sizeof(upcrt_TransDesc*)); break; case UPCRT_REF_RDWR: upcrt_err("Rd/WR not implemented yet\n"); break; default: upcrt_err("Unexpected lmad type \n"); } cl->local = c_plan->local; Set_TransDesc_Type(c_plan, UPCRT_T_CONTIG); Update_Peer_Local_Address(cl, cr, cl->local); upcrt_assert(c_plan->active < UPCRT_QUEUE_DEPTH); } else { /* redistribution with the target contiguous and the src strided */ goto REDIST; } } else { /* strided transfer */ REDIST: Set_TransDesc_Type(c_plan, 
UPCRT_T_STRIDEN); if(Test_NStride(dvc, dv_redist, (upcrt_StrideNTrans*)c_plan, cr->esize)) { switch(cl->type) { case UPCRT_REF_READ: c_plan->op_type = UPCRT_REF_READ; upcrt_Init_Comm_StrideNRead((upcrt_StrideNTrans *)c_plan); Add_Element((upcrt_Vec*)&(*cur_nest)->comm_read[0], &c_plan, sizeof(upcrt_TransDesc*)); break; case UPCRT_REF_WRITE: c_plan->op_type = UPCRT_REF_WRITE; Add_Element((upcrt_Vec*)&(*cur_nest)->comm_write[0], &c_plan, sizeof(upcrt_TransDesc*)); if(c_plan->type & UPCRT_T_PERSISTENT_MEM) { upcrt_Init_Comm_StrideNWrite((upcrt_StrideNTrans *)c_plan); } break; case UPCRT_REF_RDWR: upcrt_err("Rd/WR not implemented yet\n"); break; default: upcrt_err("Unexpected lmad type \n"); } cl->local = c_plan->local; Set_TransDesc_Type(c_plan, UPCRT_T_STRIDEN); Update_Peer_Local_Address(cl, cr, cl->local); upcrt_assert(c_plan->active < UPCRT_QUEUE_DEPTH); } else { Print_LoopNest((*cur_nest)); upcrt_err("Multiple strides not implemented yet\n"); } } } } /* if DEFAULT_PEER */ } /* forall refs */ /* upcrt_print_targets = 1; */ /* if(upcrt_print_targets && upcr_mythread() == 0) { */ /* Print_LoopNest((*cur_nest)); */ /* upcrt_print_targets = 0; */ /* } */ Analyze_and_Init_Redist(ln); end_anal += (bupc_ticks_now() - start_anal); return result; } /*analyze_redist */ void upcrt_set_a2a(int val) { upcrt_all2all = val; } void upcrt_loop_1R( int LB, int UB, int stride, int type, upcr_shared_ptr_t remote, int esize, int Nref, int Nop) { UPCR_BEGIN_FUNCTION(); upcrt_TransDesc *t; start_desc = bupc_ticks_now(); total_calls++; if(stride == 1) t = *(upcrt_TransDesc**)UPCR_TLD_ADDR(upcrt_1RS1); else t = *(upcrt_TransDesc**)UPCR_TLD_ADDR(upcrt_1RSN); t->op_type = type; t->remote = remote; t->size = ((UB-LB+1)*esize)/stride; t->Nref = Nref; t->Nop = Nop; t->active = 0; t->esize = esize; } int _upcrt_analyze_1RS1(void) { UPCR_BEGIN_FUNCTION(); int i; upcr_pshared_ptr_t remote; char *local; int stripsize; int islocal; upcrt_ContigTrans *t_1RS1 = 
*(upcrt_ContigTrans**)UPCR_TLD_ADDR(upcrt_1RS1); start_anal = bupc_ticks_now(); end_desc += (start_anal - start_desc); remote = upcr_shared_to_pshared(t_1RS1->remote); islocal = upcr_hasMyAffinity_pshared(remote); Apply_Model_1RS1(&t_1RS1->nstrips, &t_1RS1->burst, t_1RS1->size/sizeof(double), t_1RS1->Nref, t_1RS1->Nop, upcr_hasMyAffinity_pshared(remote)); if(islocal) { t_1RS1->local = upcr_pshared_to_shared(remote); return 1; } else t_1RS1->local = upcr_local_alloc(t_1RS1->size, 1); stripsize = t_1RS1->size/t_1RS1->nstrips; t_1RS1->init_comm = 0; t_1RS1->wait_comm = 0; local = (char*)upcr_shared_to_processlocal(t_1RS1->local); end_anal += (bupc_ticks_now() - start_anal); if(UPCRT_REF_READ == t_1RS1->op_type){ for(i = 0; i < t_1RS1->burst - 1; i++) { t_1RS1->handles[t_1RS1->init_comm % UPCRT_QUEUE_DEPTH] = bupc_memget_async(local, upcr_pshared_to_shared(remote), stripsize); local = local+stripsize; remote = upcr_add_psharedI(remote, 1, stripsize); t_1RS1->init_comm++; } if(t_1RS1->burst == t_1RS1->nstrips) stripsize = t_1RS1->size - stripsize*(t_1RS1->nstrips-1); t_1RS1->handles[t_1RS1->init_comm % UPCRT_QUEUE_DEPTH] = bupc_memget_async(local, upcr_pshared_to_shared(remote), stripsize); t_1RS1->init_comm++; }/* REF READ */ return 1; } int upcrt_strips_1RS1(void) { UPCR_BEGIN_FUNCTION(); upcrt_ContigTrans *t_1RS1 = *(upcrt_ContigTrans**)UPCR_TLD_ADDR(upcrt_1RS1); return (t_1RS1->size/(t_1RS1->nstrips*t_1RS1->esize)); } void _upcrt_advance_1RS1(void) { UPCR_BEGIN_FUNCTION(); int i, j; upcr_shared_ptr_t remote, rem; char *local; int stripsize; upcrt_ContigTrans *t_1RS1 = *(upcrt_ContigTrans**)UPCR_TLD_ADDR(upcrt_1RS1); if(upcr_hasMyAffinity_shared(t_1RS1->remote)) return; switch(t_1RS1->op_type) { case UPCRT_REF_READ: if(t_1RS1->init_comm - t_1RS1->wait_comm == t_1RS1->burst && t_1RS1->init_comm < t_1RS1->nstrips) { local = (char*)upcr_shared_to_processlocal(t_1RS1->local); stripsize = t_1RS1->size/t_1RS1->nstrips; local = local + stripsize*t_1RS1->init_comm; remote = 
upcr_add_shared(t_1RS1->remote, 1, stripsize*t_1RS1->init_comm, t_1RS1->size+1); for(i = 0; i < t_1RS1->burst-1; i++) { rem = upcr_add_shared(remote, 1, i*stripsize, t_1RS1->size+1); if(t_1RS1->init_comm == t_1RS1->nstrips - 1) break; t_1RS1->handles[t_1RS1->init_comm % UPCRT_QUEUE_DEPTH] = bupc_memget_async(local, rem, stripsize); local = local+stripsize; t_1RS1->init_comm++; } remote = upcr_add_shared(remote, 1, stripsize, t_1RS1->size+1); if(t_1RS1->init_comm == t_1RS1->nstrips-1) stripsize = t_1RS1->size - stripsize*(t_1RS1->nstrips-1); t_1RS1->handles[t_1RS1->init_comm % UPCRT_QUEUE_DEPTH] = bupc_memget_async(local, remote, stripsize); t_1RS1->init_comm++; } /* init next b ops */ if(t_1RS1->wait_comm < t_1RS1->nstrips) { bupc_waitsync(t_1RS1->handles[t_1RS1->wait_comm % UPCRT_QUEUE_DEPTH]); t_1RS1->wait_comm++; } break; case UPCRT_REF_WRITE: printf("WRITES NOT IMPLEMENTED YET !\n"); upcr_global_exit(-1); break; default: upcrt_assert(0); } } void _upcrt_finalize_1RS1(void) { UPCR_BEGIN_FUNCTION(); upcrt_ContigTrans *t_1RS1 = *(upcrt_ContigTrans**)UPCR_TLD_ADDR(upcrt_1RS1); if(UPCRT_REF_WRITE == t_1RS1->op_type) { printf("WRITES NOT IMPLEMENTED YET !\n"); upcr_global_exit(-1); } if(!upcr_hasMyAffinity_shared(t_1RS1->remote)) upcr_free(t_1RS1->local); } void * upcrt_get_address_1RS1(void) { UPCR_BEGIN_FUNCTION(); upcrt_ContigTrans *t_1RS1 = *(upcrt_ContigTrans**)UPCR_TLD_ADDR(upcrt_1RS1); return upcr_shared_to_processlocal(t_1RS1->local); } void upcrt_set_redist(upcr_nest_descr_t ln) { UPCR_BEGIN_FUNCTION(); upcrt_LoopNest **cur_nest = (upcrt_LoopNest**)UPCR_TLD_ADDR(upcrt_cur_nest); (*cur_nest)->isredist = 1; } void upcrt_print_vect_stats(void) { UPCR_BEGIN_FUNCTION(); printf("%d: CALLS = %d, DESC_TIME = %lld (us), TPD = %lf, ANAL_TIME = %lld (us), TPA = %lf \n", upcr_mythread(), total_calls, bupc_ticks_to_us(end_desc), (double)bupc_ticks_to_us(end_desc)/(double)total_calls, bupc_ticks_to_us(end_anal), (double)bupc_ticks_to_us(end_anal)/(double)total_calls ); } 
//////////////////////////////////////////////// void* _upcrt_get_local_address(upcr_nest_descr_t ln, upcr_ref_descr_t lref, upcr_lmad_descr_t lmad) { UPCR_BEGIN_FUNCTION(); upcr_shared_ptr_t laddr; upcrt_LoopNestVec *program = *(upcrt_LoopNestVec **)UPCR_TLD_ADDR(upcrt_program); upcrt_LoopNest **cur_nest = (upcrt_LoopNest**)UPCR_TLD_ADDR(upcrt_cur_nest); upcrt_assert((*cur_nest) == &program->elem[ln]); upcrt_assert(((*cur_nest)->refs.elem[lref].desc.elem[lmad]->type == UPCRT_REF_READ && Elems((upcrt_Vec*)&(*cur_nest)->refs.elem[lref].desc.elem[lmad]->equiv_read) == 1) || ((*cur_nest)->refs.elem[lref].desc.elem[lmad]->type == UPCRT_REF_WRITE && Elems((upcrt_Vec*)&(*cur_nest)->refs.elem[lref].desc.elem[lmad]->equiv_write) == 1) ); return upcr_shared_to_processlocal((*cur_nest)->refs.elem[lref].desc.elem[lmad]->local); } void upcrt_add_polytope_dim(upcr_nest_descr_t nest, long depth, long lbd, long ubnd, long stride) { UPCR_BEGIN_FUNCTION(); int pos; upcrt_LoopNestVec *program = *(upcrt_LoopNestVec **)UPCR_TLD_ADDR(upcrt_program); upcrt_LoopNest **cur_nest = (upcrt_LoopNest**)UPCR_TLD_ADDR(upcrt_cur_nest); pos = Elems((upcrt_Vec*)&(*cur_nest)->bounds); upcrt_assert((*cur_nest) == &program->elem[nest]); upcrt_assert(pos == depth); upcrt_assert(pos < UPCRT_MAX_DIM); (*cur_nest)->bounds.elem[depth].lb = lbd; (*cur_nest)->bounds.elem[depth].ub = ubnd; (*cur_nest)->bounds.elem[depth].stride = stride; Inc_Cnt((upcrt_Vec*)&(*cur_nest)->bounds); } upcr_ref_descr_t upcrt_new_base_ref(upcr_nest_descr_t nest, int alias, size_t esize) { UPCR_BEGIN_FUNCTION(); int pos; upcrt_LoopNestVec *program = *(upcrt_LoopNestVec **)UPCR_TLD_ADDR(upcrt_program); upcrt_LoopNest **cur_nest = (upcrt_LoopNest**)UPCR_TLD_ADDR(upcrt_cur_nest); upcrt_RefDesc **cur_ref = (upcrt_RefDesc**)UPCR_TLD_ADDR(upcrt_cur_ref); pos = Elems((upcrt_Vec*)&(*cur_nest)->refs); (*cur_ref) = &(*cur_nest)->refs.elem[pos]; upcrt_assert((*cur_nest) == &program->elem[nest]); upcrt_assert(pos < UPCRT_MAX_DIM); 
upcrt_assert(alias >= 0 && alias <= 3); /*TO DO - replace this when I figure */ /*out how to really deal with aliases*/ memset((*cur_ref), 0, sizeof(upcrt_RefDesc)); (*cur_ref)->alias = alias; (*cur_ref)->esize = esize; (*cur_ref)->peer_ref = UPCRT_DEFAULT_PEER_REF; Inc_Cnt((upcrt_Vec*)&(*cur_nest)->refs); return pos; } upcr_lmad_descr_t upcrt_new_lmad(upcr_nest_descr_t nest, upcr_ref_descr_t ref, upcr_shared_ptr_t base, int type) { UPCR_BEGIN_FUNCTION(); int pos; upcrt_LoopNestVec *program = *(upcrt_LoopNestVec **)UPCR_TLD_ADDR(upcrt_program); upcrt_LoopNest **cur_nest = (upcrt_LoopNest**)UPCR_TLD_ADDR(upcrt_cur_nest); upcrt_RefDesc **cur_ref = (upcrt_RefDesc**)UPCR_TLD_ADDR(upcrt_cur_ref); upcrt_Lmad **cur_lmad = (upcrt_Lmad**)UPCR_TLD_ADDR(upcrt_cur_lmad); upcrt_DimVec **cur_dim = (upcrt_DimVec**)UPCR_TLD_ADDR(upcrt_cur_dim); upcrt_assert((*cur_nest) == &program->elem[nest]); upcrt_assert((*cur_ref) == &(*cur_nest)->refs.elem[ref]); upcrt_assert(Elems((upcrt_Vec*)&(*cur_ref)->desc) < UPCRT_MAX_DIM); upcrt_assert(type == UPCRT_REF_READ || type == UPCRT_REF_WRITE); pos = Elems((upcrt_Vec*)&(*cur_ref)->desc); (*cur_lmad) = calloc(1,sizeof(upcrt_Lmad)); (*cur_ref)->desc.elem[pos] = (*cur_lmad); (*cur_lmad)->remote = base; (*cur_lmad)->init_pos = pos; (*cur_lmad)->type = type; (*cur_dim) = &(*cur_lmad)->dvec; (*cur_lmad)->total_span = 1; (*cur_lmad)->redist_ref = UPCRT_INVALID_REF; Inc_Cnt((upcrt_Vec*)&(*cur_ref)->desc); return pos; } upcr_lmad_descr_t _upcrt_new_lmad_local(upcr_nest_descr_t nest, upcr_ref_descr_t ref, void *base, int type) { UPCR_BEGIN_FUNCTION(); int pos; upcr_shared_ptr_t ptr = upcr_mylocal_to_shared_withphase(base, 0, upcr_mythread()); /* if(upcrt_print_targets && upcr_mythread() == 1) */ /* printf("CALL LOCAL WITH %p\n", base); */ return upcrt_new_lmad(nest, ref, ptr, type); } upcr_ref_descr_t upcrt_new_redist_ref(upcr_nest_descr_t nest, upcr_ref_descr_t peer_ref, upcr_lmad_descr_t peer_lmad, int alias, size_t esize) { UPCR_BEGIN_FUNCTION(); int 
pos; upcrt_RefDesc *pr; upcrt_LoopNestVec *program = *(upcrt_LoopNestVec **)UPCR_TLD_ADDR(upcrt_program); upcrt_LoopNest **cur_nest = (upcrt_LoopNest**)UPCR_TLD_ADDR(upcrt_cur_nest); upcrt_RefDesc **cur_ref = (upcrt_RefDesc**)UPCR_TLD_ADDR(upcrt_cur_ref); pos = Elems((upcrt_Vec*)&(*cur_nest)->refs); (*cur_ref) = &(*cur_nest)->refs.elem[pos]; upcrt_assert((*cur_nest) == &program->elem[nest]); upcrt_assert(pos < UPCRT_MAX_DIM); upcrt_assert(alias >= 0 && alias <= 3); /*TO DO - replace this when I figure */ /*out how to really deal with aliases*/ memset((*cur_ref), 0, sizeof(upcrt_RefDesc)); (*cur_ref)->alias = alias; (*cur_ref)->esize = esize; (*cur_ref)->type = UPCRT_REDIST_REF; (*cur_ref)->peer_ref = peer_ref; pr = &(*cur_nest)->refs.elem[peer_ref]; pr->desc.elem[peer_lmad]->redist_ref = pos; Inc_Cnt((upcrt_Vec*)&(*cur_nest)->refs); return pos; } void _upcrt_advance_dim(upcr_nest_descr_t ln, int dim) { UPCR_BEGIN_FUNCTION(); int i; upcrt_TransDesc *t; upcrt_LoopNestVec *program = *(upcrt_LoopNestVec **)UPCR_TLD_ADDR(upcrt_program); upcrt_LoopNest **cur_nest = (upcrt_LoopNest**)UPCR_TLD_ADDR(upcrt_cur_nest); upcrt_assert((*cur_nest) == &program->elem[ln]); upcrt_assert(dim < UPCRT_MAX_DIM); for(i=0; i < Elems((upcrt_Vec*)&(*cur_nest)->comm_read[dim]); i++) { t = (*cur_nest)->comm_read[dim].elem[i]; switch(t->type & UPCRT_T_TYPE_MASK) { case UPCRT_T_CONTIG: upcrt_Advance_Transfer_Contig((upcrt_ContigTrans*)t); break; case UPCRT_T_STRIDEN: Advance_Transfer(t); break; case UPCRT_T_FSTRIDE: case UPCRT_T_ILIST: default: upcrt_err("Transfer type %d not implemented yet\n", t->type & UPCRT_T_TYPE_MASK); break; } } } void _upcrt_start_transfers(upcr_nest_descr_t nest) { //UPCR_BEGIN_FUNCTION(); } upcr_nest_descr_t upcrt_start_nest(upcr_key_t key, int isredist) { UPCR_BEGIN_FUNCTION(); int result; upcrt_LoopNestVec *program = *(upcrt_LoopNestVec **)UPCR_TLD_ADDR(upcrt_program); upcrt_LoopNest **cur_nest = (upcrt_LoopNest**)UPCR_TLD_ADDR(upcrt_cur_nest); start_desc = 
bupc_ticks_now(); total_calls++; result = Elems((upcrt_Vec*)program); upcrt_assert(result < UPCRT_MAX_DIM); (*cur_nest) = &program->elem[result]; memset((*cur_nest), 0, sizeof(upcrt_LoopNest)); (*cur_nest)->isredist = isredist; (*cur_nest)->Nref = 1; (*cur_nest)->Nop = 1; Inc_Cnt((upcrt_Vec*)program); return result; } void _upcrt_end_nest( upcr_nest_descr_t ln) { UPCR_BEGIN_FUNCTION(); int i; int j; int n_elem, n_trans, level; upcrt_CommVec *c_op; upcrt_TransDesc **c_trans; upcrt_LoopNestVec *program = *(upcrt_LoopNestVec **)UPCR_TLD_ADDR(upcrt_program); upcrt_LoopNest **cur_nest = (upcrt_LoopNest**)UPCR_TLD_ADDR(upcrt_cur_nest); (*cur_nest) = &program->elem[ln]; for(level = 0; level < (*cur_nest)->bounds.cnt; level++) { n_elem = Elems((upcrt_Vec*)&(*cur_nest)->comm_read[level]); c_op = &(*cur_nest)->comm_read[level]; for(i = 0; i < n_elem; i++, c_op++) { n_trans = Elems((upcrt_Vec*)c_op); for(j = 0 ; j < n_trans; j++) { if(c_op->elem[j]->type & UPCRT_T_TRANSIENT_MEM) { upcr_free(c_op->elem[j]->local); } } } n_elem = Elems((upcrt_Vec*)&(*cur_nest)->comm_write[level]); c_op = &(*cur_nest)->comm_write[level]; for(i = 0; i < n_elem; i++, c_op++) { n_trans = Elems((upcrt_Vec*)c_op); for(j = 0 ; j < n_trans; j++) { if(c_op->elem[j]->type & UPCRT_T_TRANSIENT_MEM) { upcr_free(c_op->elem[j]->local); } } } } /* the only dynamic data so far is the transfer descriptor */ Dec_Cnt((upcrt_Vec*)program); } void upcrt_vect_thread_init(void) { UPCR_BEGIN_FUNCTION(); upcrt_LoopNestVec **program = (upcrt_LoopNestVec **)UPCR_TLD_ADDR(upcrt_program); upcrt_ContigTrans **t_1RS1 = (upcrt_ContigTrans **)UPCR_TLD_ADDR(upcrt_1RS1); upcrt_FstrideTrans **t_1RSN = (upcrt_FstrideTrans **)UPCR_TLD_ADDR(upcrt_1RSN); *program = (upcrt_LoopNestVec*)calloc(sizeof(upcrt_LoopNestVec), 1); *t_1RS1 = (upcrt_ContigTrans*)calloc(sizeof(upcrt_ContigTrans), 1); *t_1RSN = (upcrt_FstrideTrans*)calloc(sizeof(upcrt_FstrideTrans), 1); } #endif /*VECT_C*/