PAPI 7.1.0.0
Loading...
Searching...
No Matches
perf_helpers.h
Go to the documentation of this file.
1/*****************************************************************/
2/********* Begin perf_event low-level code ***********************/
3/*****************************************************************/
4
/* Fallback syscall numbers in case the installed kernel/libc headers */
/* are too old to define __NR_perf_event_open themselves.             */
#ifndef __NR_perf_event_open

#ifdef __powerpc__
#define __NR_perf_event_open 319
#elif defined(__x86_64__)
#define __NR_perf_event_open 298
#elif defined(__i386__)
#define __NR_perf_event_open 336
#elif defined(__arm__)
#define __NR_perf_event_open 364
#elif defined(__aarch64__)
/* aarch64 uses the asm-generic syscall table */
#define __NR_perf_event_open 241
#endif

#endif
19
/*
 * Wrapper for the perf_event_open() syscall, which glibc historically
 * does not expose.  Returns a new perf event file descriptor, or -1
 * with errno set on failure.
 */
static long
sys_perf_event_open( struct perf_event_attr *hw_event,
	pid_t pid, int cpu, int group_fd, unsigned long flags )
{
	/* use long: syscall() returns long, and an fd fits either way */
	long ret;

	ret = syscall( __NR_perf_event_open,
			hw_event, pid, cpu, group_fd, flags );

	return ret;
}
31
32
33/*
34 * We define u64 as uint64_t for every architecture
35 * so that we can print it with "%"PRIx64 without getting warnings.
36 *
37 * typedef __u64 u64;
38 * typedef __s64 s64;
39 */
40typedef uint64_t u64;
41typedef int64_t s64;
42
43typedef __u32 u32;
44typedef __s32 s32;
45
46typedef __u16 u16;
47typedef __s16 s16;
48
49typedef __u8 u8;
50typedef __s8 s8;
51
52
#ifdef __SIZEOF_INT128__
/* Compute (a * b) >> shift with a full-width intermediate product, */
/* using the compiler's native 128-bit integers when available.     */
static inline u64 mul_u64_u32_shr(u64 a, u32 b, unsigned int shift)
{
	return (u64)(((unsigned __int128)a * b) >> shift);
}

#else

#ifdef __i386__
/* 32x32->64 multiply.  On i386 use mull directly so the compiler */
/* emits a single instruction rather than a 64-bit library call.  */
static inline u64 mul_u32_u32(u32 a, u32 b)
{
	u32 high, low;

	asm ("mull %[b]" : "=a" (low), "=d" (high)
	     : [a] "a" (a), [b] "rm" (b) );

	return low | ((u64)high) << 32;
}
#else
/* Portable 32x32->64 multiply. */
static inline u64 mul_u32_u32(u32 a, u32 b)
{
	return (u64)a * b;
}
#endif

/* Fallback (a * b) >> shift built from two 32x32 multiplies,        */
/* mirroring the kernel's math64.h helper.                           */
/* NOTE(review): like the kernel version this assumes 0 < shift <= 32 */
/* (the "32 - shift" term) -- confirm callers never pass larger shifts. */
static inline u64 mul_u64_u32_shr(u64 a, u32 b, unsigned int shift)
{
	u32 ah, al;
	u64 ret;

	al = a;
	ah = a >> 32;

	ret = mul_u32_u32(al, b) >> shift;
	if (ah)
		ret += mul_u32_u32(ah, b) << (32 - shift);

	return ret;
}

#endif /* __SIZEOF_INT128__ */
94
#ifndef ARRAY_SIZE
/* Element count of a true array -- not valid on pointers/parameters. */
#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
#endif
98
99
#if defined(__x86_64__) || defined(__i386__)


/* Read the x86 time-stamp counter (result comes back in EDX:EAX). */
static inline unsigned long long rdtsc(void) {

	unsigned a,d;

	__asm__ volatile("rdtsc" : "=a" (a), "=d" (d));

	return ((unsigned long long)a) | (((unsigned long long)d) << 32);
}

/* Read hardware performance counter <counter> from user space via   */
/* the rdpmc instruction.  Only works if the kernel has enabled      */
/* user-mode counter access (cap_user_rdpmc in the mmap page).       */
static inline unsigned long long rdpmc(unsigned int counter) {

	unsigned int low, high;

	__asm__ volatile("rdpmc" : "=a" (low), "=d" (high) : "c" (counter));

	return (unsigned long long)low | ((unsigned long long)high) <<32;
}

/* Compiler-only barrier: prevents the compiler from reordering      */
/* memory accesses across it; x86's strong memory model does the rest. */
#define barrier() __asm__ volatile("" ::: "memory")
123
#elif defined(__aarch64__)

/* Indirect stringification. Doing two levels allows the parameter to be a
 * macro itself. For example, compile with -DFOO=bar, __stringify(FOO)
 * converts to "bar".
 */

#define __stringify_1(x...) #x
#define __stringify(x...) __stringify_1(x)

/* Read an aarch64 system register by name using the mrs instruction. */
#define read_sysreg(r) ({ \
	u64 __val; \
	asm volatile("mrs %0, " __stringify(r) : "=r" (__val)); \
	__val; \
})
139
/* Read the aarch64 cycle counter (PMCCNTR_EL0). */
static u64 read_pmccntr(void)
{
	return read_sysreg(pmccntr_el0);
}

/* Generate a reader for event counter PMEVCNTR<idx>_EL0.  The mrs    */
/* instruction needs the register name fixed at assembly time, so a   */
/* separate function is stamped out for each of the 31 counters.      */
#define PMEVCNTR_READ(idx) \
	static u64 read_pmevcntr_##idx(void) { \
		return read_sysreg(pmevcntr##idx##_el0); \
	}

PMEVCNTR_READ(0);
PMEVCNTR_READ(1);
PMEVCNTR_READ(2);
PMEVCNTR_READ(3);
PMEVCNTR_READ(4);
PMEVCNTR_READ(5);
PMEVCNTR_READ(6);
PMEVCNTR_READ(7);
PMEVCNTR_READ(8);
PMEVCNTR_READ(9);
PMEVCNTR_READ(10);
PMEVCNTR_READ(11);
PMEVCNTR_READ(12);
PMEVCNTR_READ(13);
PMEVCNTR_READ(14);
PMEVCNTR_READ(15);
PMEVCNTR_READ(16);
PMEVCNTR_READ(17);
PMEVCNTR_READ(18);
PMEVCNTR_READ(19);
PMEVCNTR_READ(20);
PMEVCNTR_READ(21);
PMEVCNTR_READ(22);
PMEVCNTR_READ(23);
PMEVCNTR_READ(24);
PMEVCNTR_READ(25);
PMEVCNTR_READ(26);
PMEVCNTR_READ(27);
PMEVCNTR_READ(28);
PMEVCNTR_READ(29);
PMEVCNTR_READ(30);
181
182/*
183 * Read a value direct from PMEVCNTR<idx>
184 */
185static u64 rdpmc(unsigned int counter)
186{
187 static u64 (* const read_f[])(void) = {
188 read_pmevcntr_0,
189 read_pmevcntr_1,
190 read_pmevcntr_2,
191 read_pmevcntr_3,
192 read_pmevcntr_4,
193 read_pmevcntr_5,
194 read_pmevcntr_6,
195 read_pmevcntr_7,
196 read_pmevcntr_8,
197 read_pmevcntr_9,
198 read_pmevcntr_10,
199 read_pmevcntr_11,
200 read_pmevcntr_13,
201 read_pmevcntr_12,
202 read_pmevcntr_14,
203 read_pmevcntr_15,
204 read_pmevcntr_16,
205 read_pmevcntr_17,
206 read_pmevcntr_18,
207 read_pmevcntr_19,
208 read_pmevcntr_20,
209 read_pmevcntr_21,
210 read_pmevcntr_22,
211 read_pmevcntr_23,
212 read_pmevcntr_24,
213 read_pmevcntr_25,
214 read_pmevcntr_26,
215 read_pmevcntr_27,
216 read_pmevcntr_28,
217 read_pmevcntr_29,
218 read_pmevcntr_30,
219 read_pmccntr
220 };
221
222 if (counter < ARRAY_SIZE(read_f))
223 return (read_f[counter])();
224
225 return 0;
226}
227
/* Use the generic timer's virtual count (CNTVCT_EL0) as a TSC analogue. */
static u64 rdtsc(void) { return read_sysreg(cntvct_el0); }

/* Full inner-shareable data memory barrier: aarch64 is weakly ordered, */
/* so a compiler-only barrier is not sufficient here.                   */
#define barrier() asm volatile("dmb ish" : : : "memory")

#endif
233
234#if defined(__x86_64__) || defined(__i386__) || defined(__aarch64__)
235
236static inline u64 adjust_cap_usr_time_short(u64 a, u64 b, u64 c)
237{
238 u64 ret;
239 ret = b + ((a - b) & c);
240 return ret;
241}
242
/*
 * Read the current value of a self-monitored perf event directly from
 * user space via its mmap()ed control page, avoiding a read() syscall.
 * Also reports multiplex-scaled enabled/running times through *en/*ru
 * when those pointers are non-NULL.
 *
 * addr            -- pointer to the event's perf_event_mmap_page
 * user_reset_flag -- if 1, emulate a user-space reset by zeroing the
 *                    kernel count and subtracting "reset" from the pmc
 * reset           -- raw counter value captured at reset time
 * en / ru         -- out: time enabled / time running (may be NULL)
 *
 * Returns the sign-extended 64-bit count, or 0xffffffffffffffff when
 * rdpmc is unusable for this event (caller must fall back to read()).
 */
/* based on the code in include/uapi/linux/perf_event.h */
static inline unsigned long long mmap_read_self(void *addr,
					 int user_reset_flag,
					 unsigned long long reset,
					 unsigned long long *en,
					 unsigned long long *ru) {

	struct perf_event_mmap_page *pc = addr;

	uint32_t seq, time_mult = 0, time_shift = 0, index, width;
	int64_t count;
	uint64_t enabled, running;
	uint64_t cyc = 0, time_offset = 0, time_cycles = 0, time_mask = ~0ULL;
	int64_t pmc = 0;
	uint64_t delta = 0;


	/* Seqlock-style retry loop: re-read everything if the kernel */
	/* updated the page while we were reading it.                 */
	do {
		/* The kernel increments pc->lock any time */
		/* perf_event_update_userpage() is called */
		/* So by checking now, and the end, we */
		/* can see if an update happened while we */
		/* were trying to read things, and re-try */
		/* if something changed */
		/* The barrier ensures we get the most up to date */
		/* version of the pc->lock variable */

		seq=pc->lock;
		barrier();

		/* For multiplexing */
		/* time_enabled is time the event was enabled */
		enabled = pc->time_enabled;
		/* time_running is time the event was actually running */
		running = pc->time_running;

		/* if cap_user_time is set, we can use rdtsc */
		/* to calculate more exact enabled/running time */
		/* for more accurate multiplex calculations */
		if ( (pc->cap_user_time) && (enabled != running)) {
			cyc = rdtsc();
			time_offset = pc->time_offset;
			time_mult = pc->time_mult;
			time_shift = pc->time_shift;

			if (pc->cap_user_time_short) {
				time_cycles = pc->time_cycles;
				time_mask = pc->time_mask;
			}
		}

		/* actually do the measurement */

		/* Index of register to read */
		/* 0 means stopped/not-active */
		/* Need to subtract 1 to get actual index to rdpmc() */
		index = pc->index;

		/* count is the value of the counter the last time */
		/* the kernel read it */
		/* If we don't sign extend it, we get large negative */
		/* numbers which break if an IOC_RESET is done */
		width = pc->pmc_width;
		count = pc->offset;
		if (user_reset_flag == 1) {
			count = 0;
		}

		/* Ugh, libpfm4 perf_event.h has cap_usr_rdpmc */
		/* while actual perf_event.h has cap_user_rdpmc */

		/* Only read if rdpmc enabled and event index valid */
		/* Otherwise return the older (out of date?) count value */
		if (pc->cap_usr_rdpmc && index) {

			/* Read counter value */
			pmc = rdpmc(index-1);

			/* sign extend result */
			if (user_reset_flag == 1) {
				pmc-=reset;
			}
			/* shift up then back down so bit (width-1)  */
			/* propagates through the upper bits         */
			pmc<<=(64-width);
			pmc>>=(64-width);

			/* add current count into the existing kernel count */
			count+=pmc;
		} else {
			/* Falling back because rdpmc not supported */
			/* for this event. */
			return 0xffffffffffffffffULL;
		}

		barrier();

	} while (pc->lock != seq);

	/* Event was multiplexed: extrapolate enabled/running up to "now" */
	/* using the timestamp taken inside the stable read above.        */
	if (enabled != running) {

		/* Adjust for cap_usr_time_short, a nop if not */
		cyc = adjust_cap_usr_time_short(cyc, time_cycles, time_mask);

		delta = time_offset + mul_u64_u32_shr(cyc, time_mult, time_shift);

		enabled+=delta;
		if (index)
			/* Only adjust if index is valid */
			running+=delta;
	}

	if (en) *en=enabled;
	if (ru) *ru=running;

	return count;
}
358
/*
 * Capture the raw (un-adjusted) hardware counter value for the event
 * whose mmap page is at addr.  Used to record a baseline at reset
 * time that mmap_read_self() can later subtract.  Returns 0 if the
 * page is NULL or rdpmc/index are unavailable.
 */
static inline unsigned long long mmap_read_reset_count(void *addr) {

	struct perf_event_mmap_page *pc = addr;
	uint32_t seq, index;
	uint64_t count = 0;

	if (pc == NULL) {
		return count;
	}

	/* Seqlock-style retry loop, same pattern as mmap_read_self(). */
	do {
		/* The barrier ensures we get the most up to date */
		/* version of the pc->lock variable */

		seq=pc->lock;
		barrier();

		/* actually do the measurement */

		/* Ugh, libpfm4 perf_event.h has cap_usr_rdpmc */
		/* while actual perf_event.h has cap_user_rdpmc */

		/* Index of register to read */
		/* 0 means stopped/not-active */
		/* Need to subtract 1 to get actual index to rdpmc() */
		index = pc->index;

		if (pc->cap_usr_rdpmc && index) {
			/* Read counter value */
			count = rdpmc(index-1);
		}
		barrier();

	} while (pc->lock != seq);

	return count;
}
396
397#else
/* Fallback for architectures without user-space counter reads: */
/* always report failure so callers take the read() slow path.  */
static inline unsigned long long mmap_read_self(void *addr __attribute__((unused)),
			int user_reset_flag __attribute__((unused)),
			unsigned long long reset __attribute__((unused)),
			unsigned long long *en __attribute__((unused)),
			unsigned long long *ru __attribute__((unused))) {
	return 0xffffffffffffffffULL;
}
406
/* Fallback for architectures without user-space counter reads. */
static inline unsigned long long mmap_read_reset_count(void *addr __attribute__((unused))) {
	return 0xffffffffffffffffULL;
}
411
412#endif
413
414/* These functions are based on builtin-record.c in the */
415/* kernel's tools/perf directory. */
416/* This code is from a really ancient version of perf */
417/* And should be updated/commented properly */
418
419
420static uint64_t
422{
423 struct perf_event_mmap_page *pc = pe->mmap_buf;
424 int head;
425
426 if ( pc == NULL ) {
427 PAPIERROR( "perf_event_mmap_page is NULL" );
428 return 0;
429 }
430
431 head = pc->data_head;
432 rmb();
433
434 return head;
435}
436
437static void
439{
440 struct perf_event_mmap_page *pc = pe->mmap_buf;
441
442 /* ensure all reads are done before we write the tail out. */
443 pc->data_tail = tail;
444}
445
446/* Does the kernel define these somewhere? */
447struct ip_event {
448 struct perf_event_header header;
449 uint64_t ip;
450};
452 struct perf_event_header header;
453 uint64_t id;
454 uint64_t lost;
455};
456typedef union event_union {
457 struct perf_event_header header;
458 struct ip_event ip;
459 struct lost_event lost;
461
462/* Should re-write with comments if we ever figure out what's */
463/* going on here. */
464static void
466 int profile_index )
467{
468 uint64_t head = mmap_read_head( pe );
469 uint64_t old = pe->tail;
470 unsigned char *data = ((unsigned char*)pe->mmap_buf) + getpagesize();
471 int diff;
472
473 diff = head - old;
474 if ( diff < 0 ) {
475 SUBDBG( "WARNING: failed to keep up with mmap data. head = %" PRIu64
476 ", tail = %" PRIu64 ". Discarding samples.\n", head, old );
477 /* head points to a known good entry, start there. */
478 old = head;
479 }
480
481 for( ; old != head; ) {
482 perf_sample_event_t *event = ( perf_sample_event_t * )& data[old & pe->mask];
483 perf_sample_event_t event_copy;
484 size_t size = event->header.size;
485
486 /* Event straddles the mmap boundary -- header should always */
487 /* be inside due to u64 alignment of output. */
488 if ( ( old & pe->mask ) + size != ( ( old + size ) & pe->mask ) ) {
489 uint64_t offset = old;
490 uint64_t len = min( sizeof ( *event ), size ), cpy;
491 void *dst = &event_copy;
492
493 do {
494 cpy = min( pe->mask + 1 - ( offset & pe->mask ), len );
495 memcpy( dst, &data[offset & pe->mask], cpy );
496 offset += cpy;
497 dst = ((unsigned char*)dst) + cpy;
498 len -= cpy;
499 } while ( len );
500
501 event = &event_copy;
502 }
503 old += size;
504
505 SUBDBG( "event->type = %08x\n", event->header.type );
506 SUBDBG( "event->size = %d\n", event->header.size );
507
508 switch ( event->header.type ) {
509 case PERF_RECORD_SAMPLE:
510 _papi_hwi_dispatch_profile( ( *thr )->running_eventset[cidx],
511 ( vptr_t ) ( unsigned long ) event->ip.ip,
512 0, profile_index );
513 break;
514
515 case PERF_RECORD_LOST:
516 SUBDBG( "Warning: because of a mmap buffer overrun, %" PRId64
517 " events were lost.\n"
518 "Loss was recorded when counter id %#"PRIx64
519 " overflowed.\n", event->lost.lost, event->lost.id );
520 break;
521 default:
522 SUBDBG( "Error: unexpected header type - %d\n",
523 event->header.type );
524 break;
525 }
526 }
527
528 pe->tail = old;
529 mmap_write_tail( pe, old );
530}
531
532
static long count
#define min(x, y)
Definition: darwin-common.h:4
void _papi_hwi_dispatch_profile(EventSetInfo_t *ESI, vptr_t pc, long long over, int profile_index)
Definition: extras.c:165
static double a[MATRIX_SIZE][MATRIX_SIZE]
Definition: libmsr_basic.c:38
static double b[MATRIX_SIZE][MATRIX_SIZE]
Definition: libmsr_basic.c:39
static double c[MATRIX_SIZE][MATRIX_SIZE]
Definition: libmsr_basic.c:40
unsigned long AO_t __attribute__((__aligned__(4)))
Definition: m68k.h:21
void * vptr_t
Definition: papi.h:576
#define SUBDBG(format, args...)
Definition: papi_debug.h:64
void PAPIERROR(char *format,...)
static int cidx
static uint64_t mmap_read_head(pe_event_info_t *pe)
Definition: perf_helpers.h:421
static u64 mul_u32_u32(u32 a, u32 b)
Definition: perf_helpers.h:72
__u8 u8
Definition: perf_helpers.h:49
__s32 s32
Definition: perf_helpers.h:44
__s16 s16
Definition: perf_helpers.h:47
static u64 mul_u64_u32_shr(u64 a, u32 b, unsigned int shift)
Definition: perf_helpers.h:78
int64_t s64
Definition: perf_helpers.h:41
#define ARRAY_SIZE(arr)
Definition: perf_helpers.h:96
uint64_t u64
Definition: perf_helpers.h:40
static long sys_perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu, int group_fd, unsigned long flags)
Definition: perf_helpers.h:21
__u32 u32
Definition: perf_helpers.h:43
__u16 u16
Definition: perf_helpers.h:46
static unsigned long long mmap_read_reset_count(void *addr __attribute__((unused)))
Definition: perf_helpers.h:407
static void mmap_write_tail(pe_event_info_t *pe, uint64_t tail)
Definition: perf_helpers.h:438
static unsigned long long mmap_read_self(void *addr __attribute__((unused)), int user_reset_flag __attribute__((unused)), unsigned long long reset __attribute__((unused)), unsigned long long *en __attribute__((unused)), unsigned long long *ru __attribute__((unused)))
Definition: perf_helpers.h:398
static void mmap_read(int cidx, ThreadInfo_t **thr, pe_event_info_t *pe, int profile_index)
Definition: perf_helpers.h:465
__s8 s8
Definition: perf_helpers.h:50
static const pme_power_entry_t * pe
if(file==NULL) goto out
long long int long long
Definition: sde_internal.h:85
static int pid
uint64_t ip
Definition: perf_helpers.h:449
struct perf_event_header header
Definition: perf_helpers.h:448
struct perf_event_header header
Definition: perf_helpers.h:452
uint64_t lost
Definition: perf_helpers.h:454
uint64_t id
Definition: perf_helpers.h:453
struct perf_event_header header
Definition: perf_helpers.h:457
uint64_t rdpmc(int c)
Definition: vmware.c:93