PAPI 7.1.0.0
dcache.c
Go to the documentation of this file.
#include "papi.h"
#include "caches.h"
#include "timing_kernels.h"
#include "dcache.h"
#include "params.h"
#include <math.h>

static void print_header(FILE *ofp_papi, hw_desc_t *hw_desc);
static void print_cache_sizes(FILE *ofp_papi, hw_desc_t *hw_desc);
static void print_core_affinities(FILE *ofp);

extern char* eventname;

long long min_size, max_size;
int is_core = 0;

void d_cache_driver(char* papi_event_name, cat_params_t params, hw_desc_t *hw_desc, int latency_only, int mode)
{
    int pattern = 3;
    long long stride;
    int f, cache_line;
    int status, evtCode, test_cnt = 0;
    float ppb = 16;
    FILE *ofp_papi;
    char *sufx, *papiFileName;

    // Use component ID to check if event is a core event.
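    // (PAPI_get_event_component() returns the index of the component that owns
    // an event code; component 0 is PAPI's CPU component, so a zero return
    // marks the event as a core event.)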
    if( strcmp(papi_event_name, "cat::latencies") && PAPI_OK != (status = PAPI_event_name_to_code(papi_event_name, &evtCode)) ) {
        error_handler(status, __LINE__);
    } else {
        if( 0 == PAPI_get_event_component(evtCode) )
            is_core = 1;
    }

    // Open file (pass handle to d_cache_test()).
    if(CACHE_READ_WRITE == mode){
        sufx = strdup(".data.writes");
    }else{
        sufx = strdup(".data.reads");
    }

    int l = strlen(params.outputdir)+strlen(papi_event_name)+strlen(sufx);
    papiFileName = (char *)calloc( 1+l, sizeof(char) );
    if (!papiFileName) {
        fprintf(stderr, "Unable to allocate memory. Skipping event %s.\n", papi_event_name);
        goto error0;
    }
    if (l != (sprintf(papiFileName, "%s%s%s", params.outputdir, papi_event_name, sufx))) {
        fprintf(stderr, "sprintf error. Skipping event %s.\n", papi_event_name);
        goto error1;
    }
    if (NULL == (ofp_papi = fopen(papiFileName,"w"))) {
        fprintf(stderr, "Unable to open file %s. Skipping event %s.\n", papiFileName, papi_event_name);
        goto error1;
    }

    if( (NULL==hw_desc) || (0==hw_desc->dcache_line_size[0]) )
        cache_line = 64;
    else
        cache_line = hw_desc->dcache_line_size[0];

    // Print meta-data about this run in the first few lines of the output file.
    print_header(ofp_papi, hw_desc);

    // Go through each parameter variant.
    for(pattern = 3; pattern <= 4; ++pattern)
    {
        for(f = 1; f <= 2; f *= 2)
        {
            stride = cache_line*f;
            // PPB variation only makes sense if the pattern is not sequential.
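            // (Assuming hw_desc->maxPPB > 16, this sweep visits two PPB values,
            // maxPPB and 16, so the driver runs six tests in total; hence the
            // division by 6 in the progress indicator below.)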
            if(pattern != 4)
            {
                for(ppb = (float)hw_desc->maxPPB; ppb >= 16; ppb *= 16.0/(hw_desc->maxPPB))
                {
                    if( params.show_progress )
                    {
                        printf("%3d%%\b\b\b\b",(100*test_cnt++)/6);
                        fflush(stdout);
                    }
                    status = d_cache_test(pattern, params.max_iter, hw_desc, stride, ppb, papi_event_name, latency_only, mode, ofp_papi);
                    if( status < 0 )
                        goto error2;
                }
            }
            else
            {
                if( params.show_progress )
                {
                    printf("%3d%%\b\b\b\b",(100*test_cnt++)/6);
                    fflush(stdout);
                }
                status = d_cache_test(pattern, params.max_iter, hw_desc, stride, ppb, papi_event_name, latency_only, mode, ofp_papi);
                if( status < 0 )
                    goto error2;
            }
        }
    }
error2:
    if( params.show_progress )
    {
        size_t i;
        printf("100%%");
        for(i=0; i<strlen("Total:100% Current test:100%"); i++) putchar('\b');
        fflush(stdout);
    }

    // Close files and free memory.
    fclose(ofp_papi);
error1:
    free(papiFileName);
error0:
    free(sufx);

    return;
}

int d_cache_test(int pattern, int max_iter, hw_desc_t *hw_desc, long long stride_in_bytes, float pages_per_block, char* papi_event_name, int latency_only, int mode, FILE* ofp){
    int i,j,k;
    long long *values;
    double ***rslts, *sorted_rslts;
    double ***counter, *sorted_counter;
    int status=0, guessCount, ONT;

    min_size = 2*1024/sizeof(uintptr_t);         // 2KB
    max_size = 1024*1024*1024/sizeof(uintptr_t); // 1GB
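    // (Note: min_size and max_size are element counts, not byte counts; with
    // 8-byte pointers they correspond to 256 and 128M uintptr_t elements,
    // i.e. 2KB and 1GB of buffer.)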

    // The number of different sizes we will guess, trying to find the right size.
    guessCount = 0;
    if( (NULL==hw_desc) || (hw_desc->cache_levels<=0) ){
        for(i=min_size; i<max_size; i*=2){
            // += 4 for i, i*1.25, i*1.5, i*1.75
            guessCount += 4;
        }
    }else{
        int numCaches = hw_desc->cache_levels;
        for(j=0; j<numCaches; ++j) {
            guessCount += hw_desc->pts_per_reg[j];
        }
        guessCount += hw_desc->pts_per_mm;

        int llc_idx = hw_desc->cache_levels-1;
        int num_pts = hw_desc->pts_per_mm+1;
        double factor = pow((double)FACTOR, ((double)(num_pts-1))/((double)num_pts));
        max_size = factor*(hw_desc->dcache_size[llc_idx])/hw_desc->mmsplit;
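        // (Illustrative example, values assumed rather than taken from this
        // file: if FACTOR were 16 and pts_per_mm were 3, then num_pts = 4 and
        // factor = 16^(3/4) = 8, so max_size would be 8x the LLC size divided
        // by mmsplit.)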
    }

    // Get the number of threads.
    ONT = get_thread_count();

    // Latency results from the benchmark.
    rslts = (double ***)malloc(max_iter*sizeof(double **));
    for(i=0; i<max_iter; ++i){
        rslts[i] = (double **)malloc(guessCount*sizeof(double*));
        for(j=0; j<guessCount; ++j){
            rslts[i][j] = (double *)malloc(ONT*sizeof(double));
        }
    }
    sorted_rslts = (double *)malloc(max_iter*sizeof(double));

    // Counter results from the benchmark.
    counter = (double ***)malloc(max_iter*sizeof(double **));
    for(i=0; i<max_iter; ++i){
        counter[i] = (double **)malloc(guessCount*sizeof(double*));
        for(j=0; j<guessCount; ++j){
            counter[i][j] = (double *)malloc(ONT*sizeof(double));
        }
    }
    sorted_counter = (double *)malloc(max_iter*sizeof(double));

    // List of buffer sizes which are used in the benchmark.
    values = (long long *)malloc(guessCount*sizeof(long long));

    // Set the name of the event to be monitored during the benchmark.
    eventname = papi_event_name;

    for(i=0; i<max_iter; ++i){
        status = varyBufferSizes(values, rslts[i], counter[i], hw_desc, stride_in_bytes, pages_per_block, pattern, latency_only, mode, ONT);
        if( status < 0 )
            goto cleanup;
    }

    // Sort and print latency and counter results.
    fprintf(ofp, "# PTRN=%d, STRIDE=%lld, PPB=%f, ThreadCount=%d\n", pattern, stride_in_bytes, pages_per_block, ONT);
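    // (Each data row below contains the buffer size in bytes followed by one
    // column per thread; the value reported is the first element of that
    // thread's sorted measurements over the max_iter iterations, i.e. the
    // smallest one if compar_lf sorts in ascending order.)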

    if(latency_only) {

        for(j=0; j<guessCount; ++j){
            fprintf(ofp, "%lld", values[j]);
            for(k=0; k<ONT; ++k){
                for(i=0; i<max_iter; ++i){
                    sorted_rslts[i] = rslts[i][j][k];
                }
                qsort(sorted_rslts, max_iter, sizeof(double), compar_lf);
                fprintf(ofp, " %.4lf", sorted_rslts[0]);
            }
            fprintf(ofp, "\n");
        }

    } else {

        for(j=0; j<guessCount; ++j){
            fprintf(ofp, "%lld", values[j]);
            for(k=0; k<ONT; ++k){
                for(i=0; i<max_iter; ++i){
                    sorted_counter[i] = counter[i][j][k];
                }
                qsort(sorted_counter, max_iter, sizeof(double), compar_lf);
                fprintf(ofp, " %lf", sorted_counter[0]);
            }
            fprintf(ofp, "\n");
        }
    }

cleanup:
    for(i=0; i<max_iter; ++i){
        for(j=0; j<guessCount; ++j){
            free(rslts[i][j]);
            free(counter[i][j]);
        }
        free(rslts[i]);
        free(counter[i]);
    }
    free(rslts);
    free(counter);
    free(sorted_rslts);
    free(sorted_counter);
    free(values);

    return status;
}


int varyBufferSizes(long long *values, double **rslts, double **counter, hw_desc_t *hw_desc, long long stride_in_bytes, float pages_per_block, int pattern, int latency_only, int mode, int ONT){
    long long i;
    int j, k, cnt;
    long long active_buf_len;
    int allocErr = 0;
    run_output_t out;

    long long stride = stride_in_bytes/sizeof(uintptr_t);

    uintptr_t rslt=42, *v[ONT], *ptr[ONT];

    // Allocate memory for each thread to traverse.
    #pragma omp parallel private(i) reduction(+:rslt) default(shared)
    {
        int idx = omp_get_thread_num();

        ptr[idx] = (uintptr_t *)malloc( (2LL*max_size+stride)*sizeof(uintptr_t) );
        if( !ptr[idx] ){
            fprintf(stderr, "Error: cannot allocate space for experiment.\n");
            #pragma omp critical
            {
                allocErr = -1;
            }
        }else{
            // align v to the stride.
            v[idx] = (uintptr_t *)(stride_in_bytes*(((uintptr_t)ptr[idx]+stride_in_bytes)/stride_in_bytes));
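            // (The expression rounds ptr[idx] up to the next multiple of
            // stride_in_bytes; e.g. with a 64-byte stride an address of 1000
            // becomes 1024. Example numbers are purely illustrative.)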

            // touch every page at least a few times
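            // (Stepping by 512 uintptr_t elements is 4KB with 8-byte pointers,
            // i.e. one touch per page for typical 4KB pages, forcing the OS to
            // back the allocation with physical memory before timing starts.)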
            for(i=0; i<2LL*max_size; i+=512LL){
                rslt += v[idx][i];
            }
        }
    }
    if(allocErr != 0)
    {
        goto error;
    }

    // Make a cold run
    out = probeBufferSize(16LL*stride, stride, pages_per_block, pattern, v, &rslt, latency_only, mode, ONT);
    if(out.status != 0)
        goto error;

    // Run the actual experiment
    if( (NULL==hw_desc) || (hw_desc->cache_levels<=0) ){
        cnt = 0;
        // If we don't know the cache sizes, space the measurements between two default values.
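        // (Four samples are taken per power of two: i, 1.25*i, 1.5*i, and
        // 1.75*i, matching the "guessCount += 4" bookkeeping in d_cache_test().)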
        for(active_buf_len=min_size; active_buf_len<max_size; active_buf_len*=2){
            out = probeBufferSize(active_buf_len, stride, pages_per_block, pattern, v, &rslt, latency_only, mode, ONT);
            if(out.status != 0)
                goto error;
            for(k = 0; k < ONT; ++k) {
                rslts[cnt][k] = out.dt[k];
                counter[cnt][k] = out.counter[k];
            }
            values[cnt++] = ONT*sizeof(uintptr_t)*active_buf_len;

            out = probeBufferSize((long long)((double)active_buf_len*1.25), stride, pages_per_block, pattern, v, &rslt, latency_only, mode, ONT);
            if(out.status != 0)
                goto error;
            for(k = 0; k < ONT; ++k) {
                rslts[cnt][k] = out.dt[k];
                counter[cnt][k] = out.counter[k];
            }
            values[cnt++] = ONT*sizeof(uintptr_t)*((long long)((double)active_buf_len*1.25));

            out = probeBufferSize((long long)((double)active_buf_len*1.5), stride, pages_per_block, pattern, v, &rslt, latency_only, mode, ONT);
            if(out.status != 0)
                goto error;
            for(k = 0; k < ONT; ++k) {
                rslts[cnt][k] = out.dt[k];
                counter[cnt][k] = out.counter[k];
            }
            values[cnt++] = ONT*sizeof(uintptr_t)*((long long)((double)active_buf_len*1.5));

            out = probeBufferSize((long long)((double)active_buf_len*1.75), stride, pages_per_block, pattern, v, &rslt, latency_only, mode, ONT);
            if(out.status != 0)
                goto error;
            for(k = 0; k < ONT; ++k) {
                rslts[cnt][k] = out.dt[k];
                counter[cnt][k] = out.counter[k];
            }
            values[cnt++] = ONT*sizeof(uintptr_t)*((long long)((double)active_buf_len*1.75));
        }
    }else{
        double f;
        int numCaches = hw_desc->cache_levels;
        int numHier = numCaches+1;
        int llc_idx = numCaches-1;
        int len = 0, ptsToNextCache, tmpIdx = 0;
        long long currCacheSize, nextCacheSize;
        long long *bufSizes;

        // Calculate the length of the array of buffer sizes.
        for(j=0; j<numCaches; ++j) {
            len += hw_desc->pts_per_reg[j];
        }
        len += hw_desc->pts_per_mm;

        // Allocate space for the array of buffer sizes.
        if( NULL == (bufSizes = (long long *)calloc(len, sizeof(long long))) )
            goto error;

        // Define buffer sizes.
        tmpIdx = 0;
        for(j=0; j<numHier; ++j) {

            /* The lower bound of the first cache region is set to L1/8 as a design decision.
             * All other lower bounds are set to the size of the caches, as observed per core.
             */
            if( 0 == j ) {
                currCacheSize = hw_desc->dcache_size[0]/(8.0*hw_desc->split[0]);
            } else {
                currCacheSize = hw_desc->dcache_size[j-1]/hw_desc->split[j-1];
            }

            /* The upper bound of the final "cache" region (memory in this case) is set to FACTOR times the
             * size of the LLC so that all threads cumulatively will exceed the LLC by a factor of FACTOR.
             * All other upper bounds are set to the capacity of the cache, as observed per core.
             */
            if( llc_idx+1 == j ) {
                nextCacheSize = 16LL*(hw_desc->dcache_size[llc_idx])/hw_desc->mmsplit;
                ptsToNextCache = hw_desc->pts_per_mm+1;
            } else {
                nextCacheSize = hw_desc->dcache_size[j]/hw_desc->split[j];
                ptsToNextCache = hw_desc->pts_per_reg[j]+1;
            }

            /* Choose a factor "f" to grow the buffer size by, such that we collect the user-specified
             * number of samples between each cache size, evenly distributed in a geometric fashion
             * (i.e., sizes will be equally spaced in a log graph).
             */
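            /* Worked example (illustrative numbers, not taken from any particular
             * machine): growing from a 32KB region to a 256KB region with three
             * points requested gives f = 8^(k/4) for k = 1..3, i.e. samples of
             * roughly 54KB, 91KB and 152KB, evenly spaced on a logarithmic axis.
             */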
            for(k = 1; k < ptsToNextCache; ++k) {
                f = pow(((double)nextCacheSize)/currCacheSize, ((double)k)/ptsToNextCache);
                bufSizes[tmpIdx+k-1] = f*currCacheSize;
            }

            if( llc_idx+1 == j ) {
                tmpIdx += hw_desc->pts_per_mm;
            } else {
                tmpIdx += hw_desc->pts_per_reg[j];
            }
        }

        cnt=0;
        for(j=0; j<len; j++){
            active_buf_len = bufSizes[j]/sizeof(uintptr_t);
            out = probeBufferSize(active_buf_len, stride, pages_per_block, pattern, v, &rslt, latency_only, mode, ONT);
            if(out.status != 0)
                goto error;
            for(k = 0; k < ONT; ++k) {
                rslts[cnt][k] = out.dt[k];
                counter[cnt][k] = out.counter[k];
            }
            values[cnt++] = bufSizes[j];
        }

        free(bufSizes);
    }

    // Free each thread's memory.
    for(j=0; j<ONT; ++j){
        free(ptr[j]);
    }
    return 0;

error:
    // Free each thread's memory.
    for(j=0; j<ONT; ++j){
        free(ptr[j]);
    }
    return -1;
}

int get_thread_count()
{
    int threadNum = 1;

    #pragma omp parallel default(shared)
    {
        if(!omp_get_thread_num()) {
            threadNum = omp_get_num_threads();
        }
    }

    return threadNum;
}

void print_header(FILE *ofp, hw_desc_t *hw_desc){
    // Print the core to which each thread is pinned.
    print_core_affinities(ofp);
    // Print the size of each cache divided by the number of cores that share it.
    print_cache_sizes(ofp, hw_desc);
}

void print_cache_sizes(FILE *ofp, hw_desc_t *hw_desc){
    int i;

    fprintf(ofp, "#");

    if( NULL == hw_desc ) {
        fprintf(ofp, "\n");
        return;
    }

    for(i=0; i<hw_desc->cache_levels; ++i) {
        long long sz = hw_desc->dcache_size[i]/hw_desc->split[i];
        fprintf(ofp, " L%d:%lld", i+1, sz);
    }
    fprintf(ofp, "\n");

}

void print_core_affinities(FILE *ofp) {

    int k, ONT;
    int *pinnings = NULL;

    // Get the number of threads.
    ONT = get_thread_count();

    // List of core affinities in which the index is the thread ID.
    pinnings = (int *)malloc(ONT*sizeof(int));
    if( NULL == pinnings ) {
        fprintf(stderr, "Error: cannot allocate space for experiment.\n");
        return;
    }

    #pragma omp parallel default(shared)
    {
        int idx = omp_get_thread_num();

        pinnings[idx] = sched_getcpu();
    }

    fprintf(ofp, "# Core:");
    for(k=0; k<ONT; ++k) {
        fprintf(ofp, " %d", pinnings[k]);
    }
    fprintf(ofp, "\n");

    free(pinnings);

    return;
}