PAPI  5.3.2.0
perf_event.c
1 /*
2 * File: perf_event.c
3 *
4 * Author: Corey Ashford
5 * cjashfor@us.ibm.com
6 * - based upon perfmon.c written by -
7 * Philip Mucci
8 * mucci@cs.utk.edu
9 * Mods: Gary Mohr
10 * gary.mohr@bull.com
11 * Mods: Vince Weaver
12 * vweaver1@eecs.utk.edu
13 * Mods: Philip Mucci
14 * mucci@eecs.utk.edu */
15 
16 
17 #include <fcntl.h>
18 #include <string.h>
19 #include <errno.h>
20 #include <signal.h>
21 #include <syscall.h>
22 #include <sys/utsname.h>
23 #include <sys/mman.h>
24 #include <sys/ioctl.h>
25 
26 /* PAPI-specific includes */
27 #include "papi.h"
28 #include "papi_memory.h"
29 #include "papi_internal.h"
30 #include "papi_vector.h"
31 #include "extras.h"
32 
33 /* libpfm4 includes */
34 #include "papi_libpfm4_events.h"
35 #include "pe_libpfm4_events.h"
36 #include "perfmon/pfmlib.h"
37 #include PEINCLUDE
38 
39 /* Linux-specific includes */
40 #include "mb.h"
41 #include "linux-memory.h"
42 #include "linux-timer.h"
43 #include "linux-common.h"
44 #include "linux-context.h"
45 
46 #include "perf_event_lib.h"
47 
48 /* Defines for ctx->state */
49 #define PERF_EVENTS_OPENED 0x01
50 #define PERF_EVENTS_RUNNING 0x02
51 
52 /* Static globals */
53 static int nmi_watchdog_active;
54 
55 /* Forward declaration */
56 papi_vector_t _perf_event_vector;
57 
58 /* Globals */
59 struct native_event_table_t perf_native_event_table;
60 static int our_cidx;
61 
62 /* These sentinels tell _pe_set_overflow() how to set the */
63 /* wakeup_events field in the event descriptor record. */
64 
65 #define WAKEUP_COUNTER_OVERFLOW 0
66 #define WAKEUP_PROFILING -1
67 
68 #define WAKEUP_MODE_COUNTER_OVERFLOW 0
69 #define WAKEUP_MODE_PROFILING 1
70 
71 /* The kernel developers say to never use a refresh value of 0 */
72 /* See https://lkml.org/lkml/2011/5/24/172 */
73 /* However, on some platforms (like Power) a value of 1 does not work */
74 /* We're still tracking down why this happens. */
75 
76 #if defined(__powerpc__)
77 #define PAPI_REFRESH_VALUE 0
78 #else
79 #define PAPI_REFRESH_VALUE 1
80 #endif
81 
82 /* Check for processor support */
83 /* Can be used for generic checking, though in general we only */
84 /* check for pentium4 here because support was broken for multiple */
85 /* kernel releases and the usual standard detections did not */
86 /* handle this. So we check for pentium 4 explicitly. */
87 static int
88 processor_supported(int vendor, int family) {
89 
90  /* Error out if kernel too early to support p4 */
91  if (( vendor == PAPI_VENDOR_INTEL ) && (family == 15)) {
92  if (_papi_os_info.os_version < LINUX_VERSION(2,6,35)) {
93  PAPIERROR("Pentium 4 not supported on kernels before 2.6.35");
94  return PAPI_ENOSUPP;
95  }
96  }
97  return PAPI_OK;
98 }
99 
100 /* Fix up the config based on what CPU/Vendor we are running on */
101 static int
102 pe_vendor_fixups(papi_vector_t *vector)
103 {
104  /* powerpc */
105  /* On IBM and Power6 Machines default domain should include supervisor */
106  if ( _papi_hwi_system_info.hw_info.vendor == PAPI_VENDOR_IBM ) {
107  vector->cmp_info.available_domains |=
108  PAPI_DOM_KERNEL | PAPI_DOM_SUPERVISOR;
109  if (strcmp(_papi_hwi_system_info.hw_info.model_string, "POWER6" ) == 0 ) {
110  vector->cmp_info.default_domain =
111  PAPI_DOM_USER | PAPI_DOM_KERNEL | PAPI_DOM_SUPERVISOR;
112  }
113  }
114 
115  if ( _papi_hwi_system_info.hw_info.vendor == PAPI_VENDOR_MIPS ) {
116  vector->cmp_info.available_domains |= PAPI_DOM_KERNEL;
117  }
118 
119  if ((_papi_hwi_system_info.hw_info.vendor == PAPI_VENDOR_INTEL) ||
120  (_papi_hwi_system_info.hw_info.vendor == PAPI_VENDOR_AMD)) {
121  vector->cmp_info.fast_real_timer = 1;
122  }
123  /* ARM */
124  if ( _papi_hwi_system_info.hw_info.vendor == PAPI_VENDOR_ARM) {
125  /* FIXME: this will change with Cortex A15 */
126  vector->cmp_info.available_domains |=
127  PAPI_DOM_USER | PAPI_DOM_KERNEL | PAPI_DOM_SUPERVISOR;
128  vector->cmp_info.default_domain =
129  PAPI_DOM_USER | PAPI_DOM_KERNEL | PAPI_DOM_SUPERVISOR;
130  }
131 
132  /* CRAY */
133  if ( _papi_hwi_system_info.hw_info.vendor == PAPI_VENDOR_CRAY ) {
134  vector->cmp_info.available_domains |= PAPI_DOM_OTHER;
135  }
136 
137  return PAPI_OK;
138 }
139 
140 
141 
142 /******************************************************************/
143 /******** Kernel Version Dependent Routines **********************/
144 /******************************************************************/
145 
146 /* KERNEL_CHECKS_SCHEDUABILITY_UPON_OPEN is a work-around for kernel arch
147  * implementations (e.g. x86) which don't do a static event schedulability
148  * check in sys_perf_event_open.
149  * This was fixed for x86 in the 2.6.33 kernel
150  *
151  * Also! Kernels newer than 2.6.34 will fail in a similar way
152  * if the nmi_watchdog has stolen a performance counter
153  * and we try to use the maximum number of counters.
154  * A sys_perf_event_open() will seem to succeed but will fail
155  * at read time. So re-use this work around code.
156  */
157 static int
158 bug_check_scheduability( void ) {
159 
160 #if defined(__powerpc__)
161  /* PowerPC not affected by this bug */
162 #elif defined(__mips__)
163  /* MIPS as of kernel 3.1 does not properly detect schedulability */
164  return 1;
165 #else
166  if (_papi_os_info.os_version < LINUX_VERSION(2,6,33)) return 1;
167 #endif
168 
169  if (nmi_watchdog_active) return 1;
170 
171  return 0;
172 }
173 
174 /* PERF_FORMAT_GROUP allows reading an entire group's counts at once */
175 /* before 2.6.34 PERF_FORMAT_GROUP did not work when reading results */
176 /* from attached processes. We are lazy and disable it for all cases */
177 /* commit was: 050735b08ca8a016bbace4445fa025b88fee770b */
178 
179 static int
180 bug_format_group( void ) {
181 
182  if (_papi_os_info.os_version < LINUX_VERSION(2,6,34)) return 1;
183 
184  /* MIPS, as of version 3.1, does not support this properly */
185 
186 #if defined(__mips__)
187  return 1;
188 #endif
189 
190  return 0;
191 
192 }
193 
194 
195 /* There's a bug prior to Linux 2.6.33 where if you are using */
196 /* PERF_FORMAT_GROUP, the TOTAL_TIME_ENABLED and */
197 /* TOTAL_TIME_RUNNING fields will be zero unless you disable */
198 /* the counters first */
199 static int
200 bug_sync_read( void ) {
201 
202  if (_papi_os_info.os_version < LINUX_VERSION(2,6,33)) return 1;
203 
204  return 0;
205 
206 }
207 
208 
209 /* Set the F_SETOWN_EX flag on the fd. */
210 /* This affects which thread an overflow signal gets sent to */
211 /* Handled in a subroutine to handle the fact that the behavior */
212 /* is dependent on kernel version. */
213 static int
214 fcntl_setown_fd( int fd ) {
215 
216  int ret;
217  struct f_owner_ex fown_ex;
218 
219  /* F_SETOWN_EX is not available until 2.6.32 */
220  if (_papi_os_info.os_version < LINUX_VERSION(2,6,32)) {
221 
222  /* get ownership of the descriptor */
223  ret = fcntl( fd, F_SETOWN, mygettid( ) );
224  if ( ret == -1 ) {
225  PAPIERROR( "cannot fcntl(F_SETOWN) on %d: %s", fd, strerror(errno) );
226  return PAPI_ESYS;
227  }
228  }
229  else {
230  /* set ownership of the descriptor */
231  fown_ex.type = F_OWNER_TID;
232  fown_ex.pid = mygettid();
233  ret = fcntl(fd, F_SETOWN_EX, (unsigned long)&fown_ex );
234 
235  if ( ret == -1 ) {
236  PAPIERROR( "cannot fcntl(F_SETOWN_EX) on %d: %s",
237  fd, strerror( errno ) );
238  return PAPI_ESYS;
239  }
240  }
241  return PAPI_OK;
242 }
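/*
 * Illustrative note (not part of the original file): combined with the
 * F_SETSIG call made in tune_up_fd() below, this fd ownership is what
 * lets an overflow handler tell which event fired, roughly:
 *
 *   void handler( int n, siginfo_t *info, void *uc ) {
 *       int fd = info->si_fd;   // the perf_event fd that overflowed
 *       ...
 *   }
 *
 * (see _pe_dispatch_timer() later in this file)
 */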
243 
244 /* The read format on perf_event varies based on various flags that */
245 /* are passed into it. This helper avoids copying this logic */
246 /* multiple places. */
247 static unsigned int
248 get_read_format( unsigned int multiplex,
249  unsigned int inherit,
250  int format_group )
251 {
252  unsigned int format = 0;
253 
254  /* if we need read format options for multiplexing, add them now */
255  if (multiplex) {
256  format |= PERF_FORMAT_TOTAL_TIME_ENABLED;
257  format |= PERF_FORMAT_TOTAL_TIME_RUNNING;
258  }
259 
260  /* if our kernel supports it and we are not using inherit, */
261  /* add the group read options */
262  if ( (!bug_format_group()) && !inherit) {
263  if (format_group) {
264  format |= PERF_FORMAT_GROUP;
265  }
266  }
267 
268  SUBDBG("multiplex: %d, inherit: %d, group_leader: %d, format: %#x\n",
269  multiplex, inherit, format_group, format);
270 
271  return format;
272 }
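/*
 * Illustrative note (not part of the original file): the bits chosen
 * above determine the layout a read() on the event fd returns, roughly
 * (per the kernel's read_format documentation):
 *
 *   TOTAL_TIME_ENABLED/RUNNING (multiplex case, single event):
 *       u64 value; u64 time_enabled; u64 time_running;
 *   PERF_FORMAT_GROUP (group leader, nr events):
 *       u64 nr; u64 value[nr];
 */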
273 
274 /*****************************************************************/
275 /********* End Kernel-version Dependent Routines ****************/
276 /*****************************************************************/
277 
278 /*****************************************************************/
279 /********* Begin perf_event low-level code ***********************/
280 /*****************************************************************/
281 
282 /* In case headers aren't new enough to have __NR_perf_event_open */
283 #ifndef __NR_perf_event_open
284 
285 #ifdef __powerpc__
286 #define __NR_perf_event_open 319
287 #elif defined(__x86_64__)
288 #define __NR_perf_event_open 298
289 #elif defined(__i386__)
290 #define __NR_perf_event_open 336
291 #elif defined(__arm__)
292 #define __NR_perf_event_open (366+0x900000)
293 #endif
294 
295 #endif
296 
297 static long
298 sys_perf_event_open( struct perf_event_attr *hw_event, pid_t pid, int cpu,
299  int group_fd, unsigned long flags )
300 {
301  int ret;
302 
303  SUBDBG("sys_perf_event_open(%p,%d,%d,%d,%lx)\n",hw_event,pid,cpu,group_fd,flags);
304  SUBDBG(" type: %d\n",hw_event->type);
305  SUBDBG(" size: %d\n",hw_event->size);
306  SUBDBG(" config: %"PRIx64" (%"PRIu64")\n",hw_event->config,
307  hw_event->config);
308  SUBDBG(" sample_period: %"PRIu64"\n",hw_event->sample_period);
309  SUBDBG(" sample_type: %"PRIu64"\n",hw_event->sample_type);
310  SUBDBG(" read_format: %"PRIu64"\n",hw_event->read_format);
311  SUBDBG(" disabled: %d\n",hw_event->disabled);
312  SUBDBG(" inherit: %d\n",hw_event->inherit);
313  SUBDBG(" pinned: %d\n",hw_event->pinned);
314  SUBDBG(" exclusive: %d\n",hw_event->exclusive);
315  SUBDBG(" exclude_user: %d\n",hw_event->exclude_user);
316  SUBDBG(" exclude_kernel: %d\n",hw_event->exclude_kernel);
317  SUBDBG(" exclude_hv: %d\n",hw_event->exclude_hv);
318  SUBDBG(" exclude_idle: %d\n",hw_event->exclude_idle);
319  SUBDBG(" mmap: %d\n",hw_event->mmap);
320  SUBDBG(" comm: %d\n",hw_event->comm);
321  SUBDBG(" freq: %d\n",hw_event->freq);
322  SUBDBG(" inherit_stat: %d\n",hw_event->inherit_stat);
323  SUBDBG(" enable_on_exec: %d\n",hw_event->enable_on_exec);
324  SUBDBG(" task: %d\n",hw_event->task);
325  SUBDBG(" watermark: %d\n",hw_event->watermark);
326  ret =
327  syscall( __NR_perf_event_open, hw_event, pid, cpu, group_fd, flags );
328  SUBDBG("Returned %d %d %s\n",ret,
329  ret<0?errno:0,
330  ret<0?strerror(errno):" ");
331  return ret;
332 }
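/*
 * Illustrative usage sketch (not part of the original file): counting
 * retired instructions on the calling thread, any CPU. Error checks
 * omitted for brevity.
 *
 *   struct perf_event_attr attr;
 *   long long count;
 *   memset( &attr, 0, sizeof(attr) );
 *   attr.type = PERF_TYPE_HARDWARE;
 *   attr.size = sizeof(attr);
 *   attr.config = PERF_COUNT_HW_INSTRUCTIONS;
 *   attr.disabled = 1;
 *   int fd = sys_perf_event_open( &attr, 0, -1, -1, 0 );
 *   ioctl( fd, PERF_EVENT_IOC_ENABLE, NULL );
 *   ...workload...
 *   ioctl( fd, PERF_EVENT_IOC_DISABLE, NULL );
 *   read( fd, &count, sizeof(count) );
 *   close( fd );
 */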
333 
334 
335 static int map_perf_event_errors_to_papi(int perf_event_error) {
336 
337  int ret;
338 
339  /* These mappings are approximate.
340  EINVAL in particular can mean lots of different things */
341  switch(perf_event_error) {
342  case EPERM:
343  case EACCES:
344  ret = PAPI_EPERM;
345  break;
346  case ENODEV:
347  case EOPNOTSUPP:
348  ret = PAPI_ENOSUPP;
349  break;
350  case ENOENT:
351  ret = PAPI_ENOEVNT;
352  break;
353  case ENOSYS:
354  case EAGAIN:
355  case EBUSY:
356  case E2BIG: /* Only happens if attr is the wrong size somehow */
357  case EBADF: /* We are attempting to group with an invalid file descriptor */
358  ret = PAPI_ESYS;
359  break;
360  case ENOMEM:
361  ret = PAPI_ENOMEM;
362  break;
363  case EMFILE: /* Out of file descriptors. Typically max out at 1024 */
364  ret = PAPI_ECOUNT;
365  break;
366  case EINVAL:
367  default:
368  ret = PAPI_EINVAL;
369  break;
370  }
371  return ret;
372 }
373 
374 
375 /* Check if the current set of options is supported by */
376 /* perf_events. */
377 /* We do this by temporarily opening an event with the */
378 /* desired options then closing it again. We use the */
379 /* PERF_COUNT_HW_INSTRUCTION event as a dummy event */
380 /* on the assumption it is available on all */
381 /* platforms. */
382 
383 static int
384 check_permissions( unsigned long tid,
385  unsigned int cpu_num,
386  unsigned int domain,
387  unsigned int granularity,
388  unsigned int multiplex,
389  unsigned int inherit )
390 {
391  int ev_fd;
392  struct perf_event_attr attr;
393 
394  long pid;
395 
396  /* clearing this will set a type of hardware and to count all domains */
397  memset(&attr, '\0', sizeof(attr));
398  attr.read_format = get_read_format(multiplex, inherit, 1);
399 
400  /* set the event id (config field) to instructions */
401  /* (an event that should always exist) */
402  /* This was cycles but that is missing on Niagara */
403  attr.config = PERF_COUNT_HW_INSTRUCTIONS;
404 
405  /* now set up domains this event set will be counting */
406  if (!(domain & PAPI_DOM_SUPERVISOR)) {
407  attr.exclude_hv = 1;
408  }
409  if (!(domain & PAPI_DOM_USER)) {
410  attr.exclude_user = 1;
411  }
412  if (!(domain & PAPI_DOM_KERNEL)) {
413  attr.exclude_kernel = 1;
414  }
415 
416  if (granularity==PAPI_GRN_SYS) {
417  pid = -1;
418  } else {
419  pid = tid;
420  }
421 
422  SUBDBG("Calling sys_perf_event_open() from check_permissions\n");
423 
424  ev_fd = sys_perf_event_open( &attr, pid, cpu_num, -1, 0 );
425  if ( ev_fd == -1 ) {
426  SUBDBG("sys_perf_event_open returned error. Linux says, %s",
427  strerror( errno ) );
428  return map_perf_event_errors_to_papi( errno );
429  }
430 
431  /* now close it, this was just to make sure we have permissions */
432  /* to set these options */
433  close(ev_fd);
434  return PAPI_OK;
435 }
436 
437 /* Maximum size we ever expect to read from a perf_event fd */
438 /* (this is the number of 64-bit values) */
439 /* We use this to size the read buffers */
440 /* The three is for event count, time_enabled, time_running */
441 /* and the counter term is count value and count id for each */
442 /* possible counter value. */
443 #define READ_BUFFER_SIZE (3 + (2 * PERF_EVENT_MAX_MPX_COUNTERS))
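/* Worked example (not part of the original file; the constant's value
   is an assumption): if PERF_EVENT_MAX_MPX_COUNTERS were 64, the buffer
   would hold 3 + 2*64 = 131 64-bit values, enough for the largest
   PERF_FORMAT_GROUP read requested above. */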
444 
445 
446 
447 /* KERNEL_CHECKS_SCHEDUABILITY_UPON_OPEN is a work-around for kernel arch */
448 /* implementations (e.g. x86 before 2.6.33) which don't do a static event */
449 /* schedulability check in sys_perf_event_open. It is also needed if the */
450 /* kernel is stealing an event, such as when NMI watchdog is enabled. */
451 
452 static int
453 check_scheduability( pe_context_t *ctx, pe_control_t *ctl, int idx )
454 {
455  int retval = 0, cnt = -1;
456  ( void ) ctx; /*unused */
457  long long papi_pe_buffer[READ_BUFFER_SIZE];
458  int i,group_leader_fd;
459 
460  if (bug_check_scheduability()) {
461 
462  /* If the kernel isn't tracking scheduability right */
463  /* Then we need to start/stop/read to force the event */
464  /* to be scheduled and see if an error condition happens. */
465 
466  /* get the proper fd to start */
467  group_leader_fd=ctl->events[idx].group_leader_fd;
468  if (group_leader_fd==-1) group_leader_fd=ctl->events[idx].event_fd;
469 
470  /* start the event */
471  retval = ioctl( group_leader_fd, PERF_EVENT_IOC_ENABLE, NULL );
472  if (retval == -1) {
473  PAPIERROR("ioctl(PERF_EVENT_IOC_ENABLE) failed.\n");
474  return PAPI_ESYS;
475  }
476 
477  /* stop the event */
478  retval = ioctl(group_leader_fd, PERF_EVENT_IOC_DISABLE, NULL );
479  if (retval == -1) {
480  PAPIERROR( "ioctl(PERF_EVENT_IOC_DISABLE) failed.\n" );
481  return PAPI_ESYS;
482  }
483 
484  /* See if a read returns any results */
485  cnt = read( group_leader_fd, papi_pe_buffer, sizeof(papi_pe_buffer));
486  if ( cnt == -1 ) {
487  SUBDBG( "read returned an error! Should never happen.\n" );
488  return PAPI_ESYS;
489  }
490 
491  if ( cnt == 0 ) {
492  /* We read 0 bytes if we could not schedule the event */
493  /* The kernel should have detected this at open */
494  /* but various bugs (including NMI watchdog) */
495  /* result in this behavior */
496 
497  return PAPI_ECNFLCT;
498 
499  } else {
500 
501  /* Reset all of the counters (opened so far) back to zero */
502  /* from the above brief enable/disable call pair. */
503 
504  /* We have to reset all events because reset of group leader */
505  /* does not reset all. */
506  /* we assume that the events are being added one by one and that */
507  /* we do not need to reset higher events (doing so may reset ones */
508  /* that have not been initialized yet). */
509 
510  /* Note... PERF_EVENT_IOC_RESET does not reset time running */
511  /* info if multiplexing, so we should avoid coming here if */
512  /* we are multiplexing the event. */
513  for( i = 0; i < idx; i++) {
514  retval=ioctl( ctl->events[i].event_fd, PERF_EVENT_IOC_RESET, NULL );
515  if (retval == -1) {
516  PAPIERROR( "ioctl(PERF_EVENT_IOC_RESET) #%d/%d %d "
517  "(fd %d)failed.\n",
518  i,ctl->num_events,idx,ctl->events[i].event_fd);
519  return PAPI_ESYS;
520  }
521  }
522  }
523  }
524  return PAPI_OK;
525 }
526 
527 
528 /* Do some extra work on a perf_event fd if we're doing sampling */
529 /* This mostly means setting up the mmap buffer. */
530 static int
531 tune_up_fd( pe_control_t *ctl, int evt_idx )
532 {
533  int ret;
534  void *buf_addr;
535  int fd = ctl->events[evt_idx].event_fd;
536 
537  /* Register that we would like a SIGIO notification when a mmap'd page */
538  /* becomes full. */
539  ret = fcntl( fd, F_SETFL, O_ASYNC | O_NONBLOCK );
540  if ( ret ) {
541  PAPIERROR ( "fcntl(%d, F_SETFL, O_ASYNC | O_NONBLOCK) "
542  "returned error: %s", fd, strerror( errno ) );
543  return PAPI_ESYS;
544  }
545 
546  /* Set the F_SETOWN_EX flag on the fd. */
547  /* This affects which thread an overflow signal gets sent to. */
548  ret=fcntl_setown_fd(fd);
549  if (ret!=PAPI_OK) return ret;
550 
551  /* Set FD_CLOEXEC. Otherwise if we do an exec with an overflow */
552  /* running, the overflow handler will continue into the exec()'d*/
553  /* process and kill it because no signal handler is set up. */
554  ret=fcntl(fd, F_SETFD, FD_CLOEXEC);
555  if (ret) {
556  return PAPI_ESYS;
557  }
558 
559  /* when you explicitly declare that you want a particular signal, */
560  /* even if you use the default signal, the kernel will send more */
561  /* information concerning the event to the signal handler. */
562  /* */
563  /* In particular, it will send the file descriptor from which the */
564  /* event is originating which can be quite useful when monitoring */
565  /* multiple tasks from a single thread. */
566  ret = fcntl( fd, F_SETSIG, ctl->overflow_signal );
567  if ( ret == -1 ) {
568  PAPIERROR( "cannot fcntl(F_SETSIG,%d) on %d: %s",
569  ctl->overflow_signal, fd,
570  strerror( errno ) );
571  return PAPI_ESYS;
572  }
573 
574  /* mmap() the sample buffer */
575  buf_addr = mmap( NULL, ctl->events[evt_idx].nr_mmap_pages * getpagesize(),
576  PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0 );
577  if ( buf_addr == MAP_FAILED ) {
578  PAPIERROR( "mmap(NULL,%d,%d,%d,%d,0): %s",
579  ctl->events[evt_idx].nr_mmap_pages * getpagesize( ),
580  PROT_READ, MAP_SHARED, fd, strerror( errno ) );
581  return ( PAPI_ESYS );
582  }
583 
584  SUBDBG( "Sample buffer for fd %d is located at %p\n", fd, buf_addr );
585 
586  /* Set up the mmap buffer and its associated helpers */
587  ctl->events[evt_idx].mmap_buf = (struct perf_counter_mmap_page *) buf_addr;
588  ctl->events[evt_idx].tail = 0;
589  ctl->events[evt_idx].mask = ( ctl->events[evt_idx].nr_mmap_pages - 1 ) *
590  getpagesize() - 1;
591 
592  return PAPI_OK;
593 }
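/* Worked example (not part of the original file; the page size and
   page count are assumed values): nr_mmap_pages is one metadata page
   plus a power-of-two number of data pages. With 4096-byte pages and
   nr_mmap_pages = 9, the mask above is (9-1)*4096 - 1 = 0x7fff, so
   "offset & mask" wraps offsets within the 32 KB data area. */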
594 
595 
596 
597 /* Open all events in the control state */
598 static int
599 open_pe_events( pe_context_t *ctx, pe_control_t *ctl )
600 {
601 
602  int i, ret = PAPI_OK;
603  long pid;
604 
605  if (ctl->granularity==PAPI_GRN_SYS) {
606  pid = -1;
607  }
608  else {
609  pid = ctl->tid;
610  }
611 
612  for( i = 0; i < ctl->num_events; i++ ) {
613 
614  ctl->events[i].event_opened=0;
615 
616  /* set up the attr structure. We don't set up all fields here */
617  /* as some have already been set up previously. */
618 
619  /* group leader (event 0) is special */
620  /* If we're multiplexed, everyone is a group leader */
621  if (( i == 0 ) || (ctl->multiplexed)) {
622  ctl->events[i].attr.pinned = !ctl->multiplexed;
623  ctl->events[i].attr.disabled = 1;
624  ctl->events[i].group_leader_fd=-1;
625  ctl->events[i].attr.read_format = get_read_format(ctl->multiplexed,
626  ctl->inherit,
627  !ctl->multiplexed );
628  } else {
629  ctl->events[i].attr.pinned=0;
630  ctl->events[i].attr.disabled = 0;
631  ctl->events[i].group_leader_fd=ctl->events[0].event_fd;
632  ctl->events[i].attr.read_format = get_read_format(ctl->multiplexed,
633  ctl->inherit,
634  0 );
635  }
636 
637 
638  /* try to open */
639  ctl->events[i].event_fd = sys_perf_event_open( &ctl->events[i].attr,
640  pid,
641  ctl->cpu,
642  ctl->events[i].group_leader_fd,
643  0 /* flags */
644  );
645 
646  /* Try to match Linux errors to PAPI errors */
647  if ( ctl->events[i].event_fd == -1 ) {
648  SUBDBG("sys_perf_event_open returned error on event #%d."
649  " Error: %s\n",
650  i, strerror( errno ) );
651  ret = map_perf_event_errors_to_papi( errno );
652 
653  goto open_pe_cleanup;
654  }
655 
656  SUBDBG ("sys_perf_event_open: tid: %ld, cpu_num: %d,"
657  " group_leader/fd: %d, event_fd: %d,"
658  " read_format: %#"PRIu64"\n",
659  pid, ctl->cpu, ctl->events[i].group_leader_fd,
660  ctl->events[i].event_fd, ctl->events[i].attr.read_format);
661 
662 
663  /* in many situations the kernel will indicate we opened fine */
664  /* yet things will fail later. So we need to double check */
665  /* we actually can use the events we've set up. */
666 
667  /* This is not necessary if we are multiplexing, and in fact */
668  /* we cannot do this properly if multiplexed because */
669  /* PERF_EVENT_IOC_RESET does not reset the time running info */
670  if (!ctl->multiplexed) {
671  ret = check_scheduability( ctx, ctl, i );
672 
673  if ( ret != PAPI_OK ) {
674  /* the last event did open, so we need to bump the counter */
675  /* before doing the cleanup */
676  i++;
677  goto open_pe_cleanup;
678  }
679  }
680  ctl->events[i].event_opened=1;
681  }
682 
683  /* Now that we've successfully opened all of the events, do whatever */
684  /* "tune-up" is needed to attach the mmap'd buffers, signal handlers, */
685  /* and so on. */
686  for ( i = 0; i < ctl->num_events; i++ ) {
687 
688  /* If sampling is enabled, hook up signal handler */
689  if ( ctl->events[i].attr.sample_period ) {
690  ret = tune_up_fd( ctl, i );
691  if ( ret != PAPI_OK ) {
692  /* All of the fds are open, so we need to clean up all of them */
693  i = ctl->num_events;
694  goto open_pe_cleanup;
695  }
696  } else {
697  /* Make sure this is NULL so close_pe_events works right */
698  ctl->events[i].mmap_buf = NULL;
699  }
700  }
701 
702  /* Mark the eventset as opened only if completely successful */
703  ctx->state |= PERF_EVENTS_OPENED;
704 
705  return PAPI_OK;
706 
707 open_pe_cleanup:
708  /* We encountered an error, close up the fds we successfully opened. */
709  /* We go backward in an attempt to close group leaders last, although */
710 /* that's probably not strictly necessary. */
711  while ( i > 0 ) {
712  i--;
713  if (ctl->events[i].event_fd>=0) {
714  close( ctl->events[i].event_fd );
715  ctl->events[i].event_opened=0;
716  }
717  }
718 
719  return ret;
720 }
721 
722 /* Close all of the opened events */
723 static int
724 close_pe_events( pe_context_t *ctx, pe_control_t *ctl )
725 {
726  int i;
727  int num_closed=0;
728  int events_not_opened=0;
729 
730  /* should this be a more serious error? */
731  if ( ctx->state & PERF_EVENTS_RUNNING ) {
732  SUBDBG("Closing without stopping first\n");
733  }
734 
735  /* Close child events first */
736  for( i=0; i<ctl->num_events; i++ ) {
737 
738  if (ctl->events[i].event_opened) {
739 
740  if (ctl->events[i].group_leader_fd!=-1) {
741  if ( ctl->events[i].mmap_buf ) {
742  if ( munmap ( ctl->events[i].mmap_buf,
743  ctl->events[i].nr_mmap_pages * getpagesize() ) ) {
744  PAPIERROR( "munmap of fd = %d returned error: %s",
745  ctl->events[i].event_fd, strerror( errno ) );
746  return PAPI_ESYS;
747  }
748  }
749 
750  if ( close( ctl->events[i].event_fd ) ) {
751  PAPIERROR( "close of fd = %d returned error: %s",
752  ctl->events[i].event_fd, strerror( errno ) );
753  return PAPI_ESYS;
754  } else {
755  num_closed++;
756  }
757  ctl->events[i].event_opened=0;
758  }
759  }
760  else {
761  events_not_opened++;
762  }
763  }
764 
765  /* Close the group leaders last */
766  for( i=0; i<ctl->num_events; i++ ) {
767 
768  if (ctl->events[i].event_opened) {
769 
770  if (ctl->events[i].group_leader_fd==-1) {
771  if ( ctl->events[i].mmap_buf ) {
772  if ( munmap ( ctl->events[i].mmap_buf,
773  ctl->events[i].nr_mmap_pages * getpagesize() ) ) {
774  PAPIERROR( "munmap of fd = %d returned error: %s",
775  ctl->events[i].event_fd, strerror( errno ) );
776  return PAPI_ESYS;
777  }
778  }
779 
780 
781  if ( close( ctl->events[i].event_fd ) ) {
782  PAPIERROR( "close of fd = %d returned error: %s",
783  ctl->events[i].event_fd, strerror( errno ) );
784  return PAPI_ESYS;
785  } else {
786  num_closed++;
787  }
788  ctl->events[i].event_opened=0;
789  }
790  }
791  }
792 
793 
794  if (ctl->num_events!=num_closed) {
795  if (ctl->num_events!=(num_closed+events_not_opened)) {
796  PAPIERROR("Didn't close all events: "
797  "Closed %d Not Opened: %d Expected %d\n",
798  num_closed,events_not_opened,ctl->num_events);
799  return PAPI_EBUG;
800  }
801  }
802 
803  ctl->num_events=0;
804 
805  ctx->state &= ~PERF_EVENTS_OPENED;
806 
807  return PAPI_OK;
808 }
809 
810 
811 /********************************************************************/
812 /********************************************************************/
813 /* Functions that are exported via the component interface */
814 /********************************************************************/
815 /********************************************************************/
816 
817 
818 /* set the domain. FIXME: perf_events allows per-event control of this. */
819 /* we do not handle that yet. */
820 int
821 _pe_set_domain( hwd_control_state_t *ctl, int domain )
822 {
823 
824  int i;
825  pe_control_t *pe_ctl = ( pe_control_t *) ctl;
826 
827  SUBDBG("old control domain %d, new domain %d\n",
828  pe_ctl->domain,domain);
829 
830  pe_ctl->domain = domain;
831 
832  /* Force the domain on all events */
833  for( i = 0; i < pe_ctl->num_events; i++ ) {
834  pe_ctl->events[i].attr.exclude_user =
835  !( pe_ctl->domain & PAPI_DOM_USER );
836  pe_ctl->events[i].attr.exclude_kernel =
837  !( pe_ctl->domain & PAPI_DOM_KERNEL );
838  pe_ctl->events[i].attr.exclude_hv =
839  !( pe_ctl->domain & PAPI_DOM_SUPERVISOR );
840  }
841  return PAPI_OK;
842 }
843 
844 /* Shutdown a thread */
845 int
846 _pe_shutdown_thread( hwd_context_t *ctx )
847 {
848  pe_context_t *pe_ctx = ( pe_context_t *) ctx;
849 
850  pe_ctx->initialized=0;
851 
852  return PAPI_OK;
853 }
854 
855 
856 /* reset the hardware counters */
857 /* Note: PAPI_reset() does not necessarily call this */
858 /* unless the events are actually running. */
859 int
860 _pe_reset( hwd_context_t *ctx, hwd_control_state_t *ctl )
861 {
862  int i, ret;
863  pe_control_t *pe_ctl = ( pe_control_t *) ctl;
864 
865  ( void ) ctx; /*unused */
866 
867  /* We need to reset all of the events, not just the group leaders */
868  for( i = 0; i < pe_ctl->num_events; i++ ) {
869  ret = ioctl( pe_ctl->events[i].event_fd, PERF_EVENT_IOC_RESET, NULL );
870  if ( ret == -1 ) {
871  PAPIERROR("ioctl(%d, PERF_EVENT_IOC_RESET, NULL) "
872  "returned error, Linux says: %s",
873  pe_ctl->events[i].event_fd, strerror( errno ) );
874  return PAPI_ESYS;
875  }
876  }
877 
878  return PAPI_OK;
879 }
880 
881 
882 /* write (set) the hardware counters */
883 /* Currently we do not support this. */
884 int
885 _pe_write( hwd_context_t *ctx, hwd_control_state_t *ctl,
886  long long *from )
887 {
888  ( void ) ctx; /*unused */
889  ( void ) ctl; /*unused */
890  ( void ) from; /*unused */
891  /*
892  * Counters cannot be written. Do we need to virtualize the
893  * counters so that they can be written, or perhaps modify code so that
894  * they can be written? FIXME ?
895  */
896 
897  return PAPI_ENOSUPP;
898 }
899 
900 /*
901  * perf_event provides a complicated read interface.
902  * the info returned by read() varies depending on whether
903  * you have PERF_FORMAT_GROUP, PERF_FORMAT_TOTAL_TIME_ENABLED,
904  * PERF_FORMAT_TOTAL_TIME_RUNNING, or PERF_FORMAT_ID set
905  *
906  * To simplify things we just always ask for everything. This might
907  * lead to overhead when reading more than we need, but it makes the
908  * read code a lot simpler than the original implementation we had here.
909  *
910  * For more info on the layout see include/linux/perf_event.h
911  *
912  */
913 
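/*
 * Illustrative note (not part of the original file): the two layouts
 * the code below actually parses, roughly:
 *
 *   multiplexed event (TOTAL_TIME_ENABLED | TOTAL_TIME_RUNNING):
 *       buf[0] = count, buf[1] = time_enabled, buf[2] = time_running
 *   group leader (PERF_FORMAT_GROUP, nr events):
 *       buf[0] = nr, buf[1..nr] = one count per event
 */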
914 int
915 _pe_read( hwd_context_t *ctx, hwd_control_state_t *ctl,
916  long long **events, int flags )
917 {
918  ( void ) flags; /*unused */
919  int i, ret = -1;
920  pe_context_t *pe_ctx = ( pe_context_t *) ctx;
921  pe_control_t *pe_ctl = ( pe_control_t *) ctl;
922  long long papi_pe_buffer[READ_BUFFER_SIZE];
923  long long tot_time_running, tot_time_enabled, scale;
924 
925  /* On kernels before 2.6.33 the TOTAL_TIME_ENABLED and TOTAL_TIME_RUNNING */
926  /* fields are always 0 unless the counter is disabled. So if we are on */
927  /* one of these kernels, then we must disable events before reading. */
928 
929  /* Elsewhere though we disable multiplexing on kernels before 2.6.34 */
930  /* so maybe this isn't even necessary. */
931 
932  if (bug_sync_read()) {
933  if ( pe_ctx->state & PERF_EVENTS_RUNNING ) {
934  for ( i = 0; i < pe_ctl->num_events; i++ ) {
935  /* disable only the group leaders */
936  if ( pe_ctl->events[i].group_leader_fd == -1 ) {
937  ret = ioctl( pe_ctl->events[i].event_fd,
938  PERF_EVENT_IOC_DISABLE, NULL );
939  if ( ret == -1 ) {
940  PAPIERROR("ioctl(PERF_EVENT_IOC_DISABLE) "
941  "returned an error: ", strerror( errno ));
942  return PAPI_ESYS;
943  }
944  }
945  }
946  }
947  }
948 
949 
950  /* Handle case where we are multiplexing */
951  if (pe_ctl->multiplexed) {
952 
953  /* currently we handle multiplexing by having individual events */
954  /* so we read from each in turn. */
955 
956  for ( i = 0; i < pe_ctl->num_events; i++ ) {
957 
958  ret = read( pe_ctl->events[i].event_fd, papi_pe_buffer,
959  sizeof ( papi_pe_buffer ) );
960  if ( ret == -1 ) {
961  PAPIERROR("read returned an error: ", strerror( errno ));
962  return PAPI_ESYS;
963  }
964 
965  /* We should read 3 64-bit values from the counter */
966  if (ret<(signed)(3*sizeof(long long))) {
967  PAPIERROR("Error! short read!\n");
968  return PAPI_ESYS;
969  }
970 
971  SUBDBG("read: fd: %2d, tid: %ld, cpu: %d, ret: %d\n",
972  pe_ctl->events[i].event_fd,
973  (long)pe_ctl->tid, pe_ctl->cpu, ret);
974  SUBDBG("read: %lld %lld %lld\n",papi_pe_buffer[0],
975  papi_pe_buffer[1],papi_pe_buffer[2]);
976 
977  tot_time_enabled = papi_pe_buffer[1];
978  tot_time_running = papi_pe_buffer[2];
979 
980  SUBDBG("count[%d] = (papi_pe_buffer[%d] %lld * "
981  "tot_time_enabled %lld) / tot_time_running %lld\n",
982  i, 0,papi_pe_buffer[0],
983  tot_time_enabled,tot_time_running);
984 
985  if (tot_time_running == tot_time_enabled) {
986  /* No scaling needed */
987  pe_ctl->counts[i] = papi_pe_buffer[0];
988  } else if (tot_time_running && tot_time_enabled) {
989  /* Scale factor of 100 to avoid overflows when computing */
990  /*enabled/running */
991 
992  scale = (tot_time_enabled * 100LL) / tot_time_running;
993  scale = scale * papi_pe_buffer[0];
994  scale = scale / 100LL;
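 /* Worked example (not part of the original file; numbers are
    illustrative): a raw count of 1000 with time_enabled 2,000,000 and
    time_running 500,000 gives scale = 200,000,000/500,000 = 400, then
    (400*1000)/100 = 4000 as the estimated full-speed count. */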
995  pe_ctl->counts[i] = scale;
996  } else {
997  /* This should not happen, but Phil reports it sometimes does. */
998  SUBDBG("perf_event kernel bug(?) count, enabled, "
999  "running: %lld, %lld, %lld\n",
1000  papi_pe_buffer[0],tot_time_enabled,
1001  tot_time_running);
1002 
1003  pe_ctl->counts[i] = papi_pe_buffer[0];
1004  }
1005  }
1006  }
1007 
1008  /* Handle cases where we cannot use FORMAT GROUP */
1009  else if (bug_format_group() || pe_ctl->inherit) {
1010 
1011  /* we must read each counter individually */
1012  for ( i = 0; i < pe_ctl->num_events; i++ ) {
1013 
1014  ret = read( pe_ctl->events[i].event_fd, papi_pe_buffer,
1015  sizeof ( papi_pe_buffer ) );
1016  if ( ret == -1 ) {
1017  PAPIERROR("read returned an error: ", strerror( errno ));
1018  return PAPI_ESYS;
1019  }
1020 
1021  /* we should read one 64-bit value from each counter */
1022  if (ret!=sizeof(long long)) {
1023  PAPIERROR("Error! short read!\n");
1024  PAPIERROR("read: fd: %2d, tid: %ld, cpu: %d, ret: %d\n",
1025  pe_ctl->events[i].event_fd,
1026  (long)pe_ctl->tid, pe_ctl->cpu, ret);
1027  return PAPI_ESYS;
1028  }
1029 
1030  SUBDBG("read: fd: %2d, tid: %ld, cpu: %d, ret: %d\n",
1031  pe_ctl->events[i].event_fd, (long)pe_ctl->tid,
1032  pe_ctl->cpu, ret);
1033  SUBDBG("read: %lld\n",papi_pe_buffer[0]);
1034 
1035  pe_ctl->counts[i] = papi_pe_buffer[0];
1036  }
1037  }
1038 
1039 
1040  /* Handle cases where we are using FORMAT_GROUP */
1041  /* We assume only one group leader, in position 0 */
1042 
1043  else {
1044  if (pe_ctl->events[0].group_leader_fd!=-1) {
1045  PAPIERROR("Was expecting group leader!\n");
1046  }
1047 
1048  ret = read( pe_ctl->events[0].event_fd, papi_pe_buffer,
1049  sizeof ( papi_pe_buffer ) );
1050 
1051  if ( ret == -1 ) {
1052  PAPIERROR("read returned an error: ", strerror( errno ));
1053  return PAPI_ESYS;
1054  }
1055 
1056  /* we read 1 64-bit value (number of events) then */
1057  /* num_events more 64-bit values that hold the counts */
1058  if (ret<(signed)((1+pe_ctl->num_events)*sizeof(long long))) {
1059  PAPIERROR("Error! short read!\n");
1060  return PAPI_ESYS;
1061  }
1062 
1063  SUBDBG("read: fd: %2d, tid: %ld, cpu: %d, ret: %d\n",
1064  pe_ctl->events[0].event_fd,
1065  (long)pe_ctl->tid, pe_ctl->cpu, ret);
1066  {
1067  int j;
1068  for(j=0;j<ret/8;j++) {
1069  SUBDBG("read %d: %lld\n",j,papi_pe_buffer[j]);
1070  }
1071  }
1072 
1073  /* Make sure the kernel agrees with how many events we have */
1074  if (papi_pe_buffer[0]!=pe_ctl->num_events) {
1075  PAPIERROR("Error! Wrong number of events!\n");
1076  return PAPI_ESYS;
1077  }
1078 
1079  /* put the count values in their proper location */
1080  for(i=0;i<papi_pe_buffer[0];i++) {
1081  pe_ctl->counts[i] = papi_pe_buffer[1+i];
1082  }
1083  }
1084 
1085 
1086  /* If we disabled the counters due to the sync_read_bug(), */
1087  /* then we need to re-enable them now. */
1088  if (bug_sync_read()) {
1089  if ( pe_ctx->state & PERF_EVENTS_RUNNING ) {
1090  for ( i = 0; i < pe_ctl->num_events; i++ ) {
1091  if ( pe_ctl->events[i].group_leader_fd == -1 ) {
1092  /* this should refresh any overflow counters too */
1093  ret = ioctl( pe_ctl->events[i].event_fd,
1094  PERF_EVENT_IOC_ENABLE, NULL );
1095  if ( ret == -1 ) {
1096  /* Should never happen */
1097  PAPIERROR("ioctl(PERF_EVENT_IOC_ENABLE) returned an error: ",
1098  strerror( errno ));
1099  return PAPI_ESYS;
1100  }
1101  }
1102  }
1103  }
1104  }
1105 
1106  /* point PAPI to the values we read */
1107  *events = pe_ctl->counts;
1108 
1109  return PAPI_OK;
1110 }
1111 
1112 /* Start counting events */
1113 int
1114 _pe_start( hwd_context_t *ctx, hwd_control_state_t *ctl )
1115 {
1116  int ret;
1117  int i;
1118  int did_something = 0;
1119  pe_context_t *pe_ctx = ( pe_context_t *) ctx;
1120  pe_control_t *pe_ctl = ( pe_control_t *) ctl;
1121 
1122  /* Reset the counters first. Is this necessary? */
1123  ret = _pe_reset( pe_ctx, pe_ctl );
1124  if ( ret ) {
1125  return ret;
1126  }
1127 
1128  /* Enable all of the group leaders */
1129  /* All group leaders have a group_leader_fd of -1 */
1130  for( i = 0; i < pe_ctl->num_events; i++ ) {
1131  if (pe_ctl->events[i].group_leader_fd == -1) {
1132  SUBDBG("ioctl(enable): fd: %d\n", pe_ctl->events[i].event_fd);
1133  ret=ioctl( pe_ctl->events[i].event_fd, PERF_EVENT_IOC_ENABLE, NULL) ;
1134 
1135  /* ioctls always return -1 on failure */
1136  if (ret == -1) {
1137  PAPIERROR("ioctl(PERF_EVENT_IOC_ENABLE) failed.\n");
1138  return PAPI_ESYS;
1139  }
1140 
1141  did_something++;
1142  }
1143  }
1144 
1145  if (!did_something) {
1146  PAPIERROR("Did not enable any counters.\n");
1147  return PAPI_EBUG;
1148  }
1149 
1150  pe_ctx->state |= PERF_EVENTS_RUNNING;
1151 
1152  return PAPI_OK;
1153 
1154 }
1155 
1156 /* Stop all of the counters */
1157 int
1158 _pe_stop( hwd_context_t *ctx, hwd_control_state_t *ctl )
1159 {
1160 
1161  int ret;
1162  int i;
1163  pe_context_t *pe_ctx = ( pe_context_t *) ctx;
1164  pe_control_t *pe_ctl = ( pe_control_t *) ctl;
1165 
1166  /* Just disable the group leaders */
1167  for ( i = 0; i < pe_ctl->num_events; i++ ) {
1168  if ( pe_ctl->events[i].group_leader_fd == -1 ) {
1169  ret=ioctl( pe_ctl->events[i].event_fd, PERF_EVENT_IOC_DISABLE, NULL);
1170  if ( ret == -1 ) {
1171  PAPIERROR( "ioctl(%d, PERF_EVENT_IOC_DISABLE, NULL) "
1172  "returned error, Linux says: %s",
1173  pe_ctl->events[i].event_fd, strerror( errno ) );
1174  return PAPI_EBUG;
1175  }
1176  }
1177  }
1178 
1179  pe_ctx->state &= ~PERF_EVENTS_RUNNING;
1180 
1181  return PAPI_OK;
1182 }
1183 
1184 /* This function clears the current contents of the control structure and
1185  updates it with whatever resources are allocated for all the native events
1186  in the native info structure array. */
1187 
1188 int
1189 _pe_update_control_state( hwd_control_state_t *ctl,
1190  NativeInfo_t *native,
1191  int count, hwd_context_t *ctx )
1192 {
1193  int i = 0, ret;
1194  pe_context_t *pe_ctx = ( pe_context_t *) ctx;
1195  pe_control_t *pe_ctl = ( pe_control_t *) ctl;
1196 
1197  /* close all of the existing fds and start over again */
1198  /* In theory we could have finer-grained control and know if */
1199  /* things were changed, but it's easier to tear things down and rebuild. */
1200  close_pe_events( pe_ctx, pe_ctl );
1201 
1202  /* Calling with count==0 should be OK, it's how things are deallocated */
1203  /* when an eventset is destroyed. */
1204  if ( count == 0 ) {
1205  SUBDBG( "Called with count == 0\n" );
1206  return PAPI_OK;
1207  }
1208 
1209  /* set up all the events */
1210  for( i = 0; i < count; i++ ) {
1211  if ( native ) {
1212  /* Have libpfm4 set the config values for the event */
1213  ret = _pe_libpfm4_setup_counters( &pe_ctl->events[i].attr,
1214  native[i].ni_event,
1215  pe_ctx->event_table);
1216  SUBDBG( "pe_ctl->events[%d].config=%"PRIx64"\n",i,
1217  pe_ctl->events[i].attr.config);
1218  if (ret!=PAPI_OK) return ret;
1219 
1220  } else {
1221  /* I'm not sure how we'd end up in this case */
1222  /* should it be an error? */
1223  }
1224 
1225  /* Copy the inherit flag into the attribute block that will be */
1226  /* passed to the kernel */
1227  pe_ctl->events[i].attr.inherit = pe_ctl->inherit;
1228 
1229  /* Set the position in the native structure */
1230  /* We just set up events linearly */
1231  if ( native ) {
1232  native[i].ni_position = i;
1233  }
1234  }
1235 
1236  pe_ctl->num_events = count;
1237  _pe_set_domain( ctl, pe_ctl->domain );
1238 
1239  /* actually open the events */
1240  /* (why is this a separate function?) */
1241  ret = open_pe_events( pe_ctx, pe_ctl );
1242  if ( ret != PAPI_OK ) {
1243  SUBDBG("open_pe_events failed\n");
1244  /* Restore values ? */
1245  return ret;
1246  }
1247 
1248  return PAPI_OK;
1249 }
1250 
1251 /* Set various options on a control state */
1252 int
1253 _pe_ctl( hwd_context_t *ctx, int code, _papi_int_option_t *option )
1254 {
1255  int ret;
1256  pe_context_t *pe_ctx = ( pe_context_t *) ctx;
1257  pe_control_t *pe_ctl = NULL;
1258 
1259  switch ( code ) {
1260  case PAPI_MULTIPLEX:
1261  pe_ctl = ( pe_control_t * ) ( option->multiplex.ESI->ctl_state );
1262  ret = check_permissions( pe_ctl->tid, pe_ctl->cpu, pe_ctl->domain,
1263  pe_ctl->granularity,
1264  1, pe_ctl->inherit );
1265  if (ret != PAPI_OK) {
1266  return ret;
1267  }
1268 
1269  /* looks like we are allowed, so set multiplexed attribute */
1270  pe_ctl->multiplexed = 1;
1271  ret = _pe_update_control_state( pe_ctl, NULL,
1272  pe_ctl->num_events, pe_ctx );
1273  if (ret != PAPI_OK) {
1274  pe_ctl->multiplexed = 0;
1275  }
1276  return ret;
1277 
1278  case PAPI_ATTACH:
1279  pe_ctl = ( pe_control_t * ) ( option->attach.ESI->ctl_state );
1280  ret = check_permissions( option->attach.tid, pe_ctl->cpu,
1281  pe_ctl->domain, pe_ctl->granularity,
1282  pe_ctl->multiplexed,
1283  pe_ctl->inherit );
1284  if (ret != PAPI_OK) {
1285  return ret;
1286  }
1287 
1288  pe_ctl->tid = option->attach.tid;
1289 
1290  /* If events have already been added, something may */
1291  /* have been done to the kernel, so update */
1292  ret =_pe_update_control_state( pe_ctl, NULL,
1293  pe_ctl->num_events, pe_ctx);
1294 
1295  return ret;
1296 
1297  case PAPI_DETACH:
1298  pe_ctl = ( pe_control_t *) ( option->attach.ESI->ctl_state );
1299 
1300  pe_ctl->tid = 0;
1301  return PAPI_OK;
1302 
1303  case PAPI_CPU_ATTACH:
1304  pe_ctl = ( pe_control_t *) ( option->cpu.ESI->ctl_state );
1305  ret = check_permissions( pe_ctl->tid, option->cpu.cpu_num,
1306  pe_ctl->domain, pe_ctl->granularity,
1307  pe_ctl->multiplexed,
1308  pe_ctl->inherit );
1309  if (ret != PAPI_OK) {
1310  return ret;
1311  }
1312  /* looks like we are allowed so set cpu number */
1313 
1314  /* this tells the kernel not to count for a thread */
1315  /* should we warn if we try to set both? perf_event */
1316  /* will reject it. */
1317  pe_ctl->tid = -1;
1318 
1319  pe_ctl->cpu = option->cpu.cpu_num;
1320 
1321  return PAPI_OK;
1322 
1323  case PAPI_DOMAIN:
1324  pe_ctl = ( pe_control_t *) ( option->domain.ESI->ctl_state );
1325  ret = check_permissions( pe_ctl->tid, pe_ctl->cpu,
1326  option->domain.domain,
1327  pe_ctl->granularity,
1328  pe_ctl->multiplexed,
1329  pe_ctl->inherit );
1330  if (ret != PAPI_OK) {
1331  return ret;
1332  }
1333  /* looks like we are allowed, so set counting domain */
1334  return _pe_set_domain( pe_ctl, option->domain.domain );
1335 
1336  case PAPI_GRANUL:
1337  pe_ctl = (pe_control_t *) ( option->granularity.ESI->ctl_state );
1338 
1339  /* FIXME: we really don't support this yet */
1340 
1341  switch ( option->granularity.granularity ) {
1342  case PAPI_GRN_PROCG:
1343  case PAPI_GRN_SYS_CPU:
1344  case PAPI_GRN_PROC:
1345  return PAPI_ECMP;
1346 
1347  /* Currently we only support thread and CPU granularity */
1348  case PAPI_GRN_SYS:
1349  pe_ctl->granularity=PAPI_GRN_SYS;
1350  break;
1351 
1352  case PAPI_GRN_THR:
1353  pe_ctl->granularity=PAPI_GRN_THR;
1354  break;
1355 
1356 
1357  default:
1358  return PAPI_EINVAL;
1359  }
1360  return PAPI_OK;
1361 
1362  case PAPI_INHERIT:
1363  pe_ctl = (pe_control_t *) ( option->inherit.ESI->ctl_state );
1364  ret = check_permissions( pe_ctl->tid, pe_ctl->cpu, pe_ctl->domain,
1365  pe_ctl->granularity, pe_ctl->multiplexed,
1366  option->inherit.inherit );
1367  if (ret != PAPI_OK) {
1368  return ret;
1369  }
1370  /* looks like we are allowed, so set the requested inheritance */
1371  if (option->inherit.inherit) {
1372  /* children will inherit counters */
1373  pe_ctl->inherit = 1;
1374  } else {
1375  /* children won't inherit counters */
1376  pe_ctl->inherit = 0;
1377  }
1378  return PAPI_OK;
1379 
1380  case PAPI_DATA_ADDRESS:
1381  return PAPI_ENOSUPP;
1382 #if 0
1383  pe_ctl = (pe_control_t *) (option->address_range.ESI->ctl_state);
1384  ret = set_default_domain( pe_ctl, option->address_range.domain );
1385  if ( ret != PAPI_OK ) {
1386  return ret;
1387  }
1388  set_drange( pe_ctx, pe_ctl, option );
1389  return PAPI_OK;
1390 #endif
1391  case PAPI_INSTR_ADDRESS:
1392  return PAPI_ENOSUPP;
1393 #if 0
1394  pe_ctl = (pe_control_t *) (option->address_range.ESI->ctl_state);
1395  ret = set_default_domain( pe_ctl, option->address_range.domain );
1396  if ( ret != PAPI_OK ) {
1397  return ret;
1398  }
1399  set_irange( pe_ctx, pe_ctl, option );
1400  return PAPI_OK;
1401 #endif
1402 
1403  case PAPI_DEF_ITIMER:
1404  /* What should we be checking for here? */
1405  /* This seems like it should be OS-specific not component */
1406  /* specific. */
1407 
1408  return PAPI_OK;
1409 
1410  case PAPI_DEF_MPX_NS:
1411  /* Defining a given ns per set is not currently supported */
1412  return PAPI_ENOSUPP;
1413 
1414  case PAPI_DEF_ITIMER_NS:
1415  /* We don't support this... */
1416  return PAPI_OK;
1417 
1418  default:
1419  return PAPI_ENOSUPP;
1420  }
1421 }
1422 
1423 /* Initialize a thread */
1424 int
1425 _pe_init_thread( hwd_context_t *hwd_ctx )
1426 {
1427 
1428  pe_context_t *pe_ctx = ( pe_context_t *) hwd_ctx;
1429 
1430  /* clear the context structure and mark as initialized */
1431  memset( pe_ctx, 0, sizeof ( pe_context_t ) );
1432  pe_ctx->initialized=1;
1433  pe_ctx->event_table=&perf_native_event_table;
1434  pe_ctx->cidx=our_cidx;
1435 
1436  return PAPI_OK;
1437 }
1438 
1439 /* Initialize a new control state */
1440 int
1441 _pe_init_control_state( hwd_control_state_t *ctl )
1442 {
1443  pe_control_t *pe_ctl = ( pe_control_t *) ctl;
1444 
1445  /* clear the contents */
1446  memset( pe_ctl, 0, sizeof ( pe_control_t ) );
1447 
1448  /* Set the domain */
1449  _pe_set_domain( ctl, _perf_event_vector.cmp_info.default_domain );
1450 
1451  /* default granularity */
1452  pe_ctl->granularity= _perf_event_vector.cmp_info.default_granularity;
1453 
1454  /* overflow signal */
1455  pe_ctl->overflow_signal=_perf_event_vector.cmp_info.hardware_intr_sig;
1456 
1457  pe_ctl->cidx=our_cidx;
1458 
1459  /* Set cpu number in the control block to show events */
1460  /* are not tied to specific cpu */
1461  pe_ctl->cpu = -1;
1462  return PAPI_OK;
1463 }
1464 
1465 /* Check the mmap page for rdpmc support */
1466 static int _pe_detect_rdpmc(int default_domain) {
1467 
1468  struct perf_event_attr pe;
1469  int fd,rdpmc_exists=1;
1470  void *addr;
1471  struct perf_event_mmap_page *our_mmap;
1472 
1473  /* Create a fake instructions event so we can read a mmap page */
1474  memset(&pe,0,sizeof(struct perf_event_attr));
1475 
1476  pe.type=PERF_TYPE_HARDWARE;
1477  pe.size=sizeof(struct perf_event_attr);
1478  pe.config=PERF_COUNT_HW_INSTRUCTIONS;
1479 
1480  /* There should probably be a helper function to handle this */
1481  /* we break on some ARM because there is no support for excluding */
1482  /* kernel. */
1483  if (default_domain & PAPI_DOM_KERNEL ) {
1484  }
1485  else {
1486  pe.exclude_kernel=1;
1487  }
1488  fd=sys_perf_event_open(&pe,0,-1,-1,0);
1489  if (fd<0) {
1490  return PAPI_ESYS;
1491  }
1492 
1493  /* create the mmap page */
1494  addr=mmap(NULL, 4096, PROT_READ, MAP_SHARED,fd,0);
1495  if (addr == (void *)(-1)) {
1496  close(fd);
1497  return PAPI_ESYS;
1498  }
1499 
1500  /* get the rdpmc info */
1501  our_mmap=(struct perf_event_mmap_page *)addr;
1502  if (our_mmap->cap_usr_rdpmc==0) {
1503  rdpmc_exists=0;
1504  }
1505 
1506  /* close the fake event */
1507  munmap(addr,4096);
1508  close(fd);
1509 
1510  return rdpmc_exists;
1511 
1512 }
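/*
 * Illustrative sketch (not part of the original file): when
 * cap_usr_rdpmc is set, x86 user space could read a running counter
 * without a syscall, along these lines (a seqlock retry loop on the
 * mmap page is also required; omitted here):
 *
 *   unsigned int idx = our_mmap->index;
 *   if (idx)
 *       count = our_mmap->offset + __builtin_ia32_rdpmc( idx - 1 );
 */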
1513 
1514 
1515 /* Initialize the perf_event component */
1516 int
1517 _pe_init_component( int cidx )
1518 {
1519 
1520  int retval;
1521  int paranoid_level;
1522 
1523  FILE *fff;
1524 
1525  our_cidx=cidx;
1526 
1527  /* This is the official way to detect if perf_event support exists */
1528  /* The file is called perf_counter_paranoid on 2.6.31 */
1529  /* currently we are lazy and do not support 2.6.31 kernels */
1530  fff=fopen("/proc/sys/kernel/perf_event_paranoid","r");
1531  if (fff==NULL) {
1532  strncpy(_papi_hwd[cidx]->cmp_info.disabled_reason,
1533  "perf_event support not detected",PAPI_MAX_STR_LEN);
1534  return PAPI_ENOCMP;
1535  }
1536 
1537  /* 2 means no kernel measurements allowed */
1538  /* 1 means normal counter access */
1539  /* 0 means you can access CPU-specific data */
1540  /* -1 means no restrictions */
1541  retval=fscanf(fff,"%d",&paranoid_level);
1542  if (retval!=1) fprintf(stderr,"Error reading paranoid level\n");
1543  fclose(fff);
1544 
1545  if ((paranoid_level==2) && (getuid()!=0)) {
1546  SUBDBG("/proc/sys/kernel/perf_event_paranoid prohibits kernel counts");
1547  _papi_hwd[cidx]->cmp_info.available_domains &=~PAPI_DOM_KERNEL;
1548  }
1549 
1550  /* Detect NMI watchdog which can steal counters */
1551  nmi_watchdog_active=_linux_detect_nmi_watchdog();
1552  if (nmi_watchdog_active) {
1553  SUBDBG("The Linux nmi_watchdog is using one of the performance "
1554  "counters, reducing the total number available.\n");
1555  }
1556  /* Kernel multiplexing is broken prior to kernel 2.6.34 */
1557  /* The fix was probably git commit: */
1558  /* 45e16a6834b6af098702e5ea6c9a40de42ff77d8 */
1559  if (_papi_os_info.os_version < LINUX_VERSION(2,6,34)) {
1560  _papi_hwd[cidx]->cmp_info.kernel_multiplex = 0;
1561  _papi_hwd[cidx]->cmp_info.num_mpx_cntrs = PAPI_MAX_SW_MPX_EVENTS;
1562  }
1563  else {
1564  _papi_hwd[cidx]->cmp_info.kernel_multiplex = 1;
1565  _papi_hwd[cidx]->cmp_info.num_mpx_cntrs = PERF_EVENT_MAX_MPX_COUNTERS;
1566  }
1567 
1568  /* Check that processor is supported */
1569  if (processor_supported(_papi_hwi_system_info.hw_info.vendor,
1570  _papi_hwi_system_info.hw_info.cpuid_family)!=
1571  PAPI_OK) {
1572  fprintf(stderr,"warning, your processor is unsupported\n");
1573  /* should not return error, as software events should still work */
1574  }
1575 
1576  /* Setup mmtimers, if appropriate */
1577  retval=mmtimer_setup();
1578  if (retval) {
1579  strncpy(_papi_hwd[cidx]->cmp_info.disabled_reason,
1580  "Error initializing mmtimer",PAPI_MAX_STR_LEN);
1581  return retval;
1582  }
1583 
1584  /* Set the overflow signal */
1585  _papi_hwd[cidx]->cmp_info.hardware_intr_sig = SIGRTMIN + 2;
1586 
1587  /* Run Vendor-specific fixups */
1588  pe_vendor_fixups(_papi_hwd[cidx]);
1589 
1590  /* Detect if we can use rdpmc (or equivalent) */
1591  /* We currently do not use rdpmc as it is slower in tests */
1592  /* than regular read (as of Linux 3.5) */
1593  retval=_pe_detect_rdpmc(_papi_hwd[cidx]->cmp_info.default_domain);
1594  if (retval < 0 ) {
1595  strncpy(_papi_hwd[cidx]->cmp_info.disabled_reason,
1596  "sys_perf_event_open() failed, perf_event support for this platform may be broken",PAPI_MAX_STR_LEN);
1597 
1598  return retval;
1599  }
1600  _papi_hwd[cidx]->cmp_info.fast_counter_read = retval;
1601 
1602  /* Run the libpfm4-specific setup */
1603  retval = _papi_libpfm4_init(_papi_hwd[cidx]);
1604  if (retval) {
1605  strncpy(_papi_hwd[cidx]->cmp_info.disabled_reason,
1606  "Error initializing libpfm4",PAPI_MAX_STR_LEN);
1607  return retval;
1608  }
1609 
1610  retval = _pe_libpfm4_init(_papi_hwd[cidx], cidx,
1611  &perf_native_event_table,
1612  PMU_TYPE_CORE );
1613  if (retval) {
1614  strncpy(_papi_hwd[cidx]->cmp_info.disabled_reason,
1615  "Error initializing libpfm4",PAPI_MAX_STR_LEN);
1616  return retval;
1617  }
1618 
1619  return PAPI_OK;
1620 
1621 }
1622 
1623 /* Shutdown the perf_event component */
1624 int
1625 _pe_shutdown_component( void ) {
1626 
1627  /* deallocate our event table */
1628  _pe_libpfm4_shutdown( &perf_native_event_table );
1629 
1630  /* Shutdown libpfm4 */
1631  _papi_libpfm4_shutdown();
1632 
1633  return PAPI_OK;
1634 }
1635 
1636 
1637 
1638 
1639 int
1640 _pe_ntv_enum_events( unsigned int *PapiEventCode, int modifier )
1641 {
1642  return _pe_libpfm4_ntv_enum_events(PapiEventCode, modifier,
1643  &perf_native_event_table);
1644 }
1645 
1646 int
1647 _pe_ntv_name_to_code( char *name, unsigned int *event_code) {
1648  return _pe_libpfm4_ntv_name_to_code(name,event_code,
1649  &perf_native_event_table);
1650 }
1651 
1652 int
1653 _pe_ntv_code_to_name(unsigned int EventCode,
1654  char *ntv_name, int len) {
1655  return _pe_libpfm4_ntv_code_to_name(EventCode,
1656  ntv_name, len,
1657  &perf_native_event_table);
1658 }
1659 
1660 int
1661 _pe_ntv_code_to_descr( unsigned int EventCode,
1662  char *ntv_descr, int len) {
1663 
1664  return _pe_libpfm4_ntv_code_to_descr(EventCode,ntv_descr,len,
1665  &perf_native_event_table);
1666 }
1667 
1668 int
1669 _pe_ntv_code_to_info(unsigned int EventCode,
1670  PAPI_event_info_t *info) {
1671 
1672  return _pe_libpfm4_ntv_code_to_info(EventCode, info,
1673  &perf_native_event_table);
1674 }
1675 
1676 /* These functions are based on builtin-record.c in the */
1677 /* kernel's tools/perf directory. */
1678 
1679 static uint64_t
1680 mmap_read_head( pe_event_info_t *pe )
1681 {
1682  struct perf_event_mmap_page *pc = pe->mmap_buf;
1683  int head;
1684 
1685  if ( pc == NULL ) {
1686  PAPIERROR( "perf_event_mmap_page is NULL" );
1687  return 0;
1688  }
1689 
1690  head = pc->data_head;
1691  rmb( );
1692 
1693  return head;
1694 }
1695 
1696 static void
1697 mmap_write_tail( pe_event_info_t *pe, uint64_t tail )
1698 {
1699  struct perf_event_mmap_page *pc = pe->mmap_buf;
1700 
1701  /* ensure all reads are done before we write the tail out. */
1702  pc->data_tail = tail;
1703 }
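/*
 * Illustrative note (not part of the original file): these two helpers
 * form the user-space side of the ring-buffer protocol. The rmb() in
 * mmap_read_head() keeps sample reads from being reordered before the
 * data_head load; writing data_tail afterwards tells the kernel that
 * everything up to tail has been consumed and may be overwritten.
 */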
1704 
1705 
1706 /* Does the kernel define these somewhere? */
1707 struct ip_event {
1708  struct perf_event_header header;
1709  uint64_t ip;
1710 };
1711 struct lost_event {
1712  struct perf_event_header header;
1713  uint64_t id;
1714  uint64_t lost;
1715 };
1716 typedef union event_union {
1717  struct perf_event_header header;
1718  struct ip_event ip;
1719  struct lost_event lost;
1720 } perf_sample_event_t;
1721 
1722 /* Should re-write with comments if we ever figure out what's */
1723 /* going on here. */
1724 static void
1725 mmap_read( int cidx, ThreadInfo_t **thr, pe_event_info_t *pe,
1726  int profile_index )
1727 {
1728  uint64_t head = mmap_read_head( pe );
1729  uint64_t old = pe->tail;
1730  unsigned char *data = ((unsigned char*)pe->mmap_buf) + getpagesize( );
1731  int diff;
1732 
1733  diff = head - old;
1734  if ( diff < 0 ) {
1735  SUBDBG( "WARNING: failed to keep up with mmap data. head = %" PRIu64
1736  ", tail = %" PRIu64 ". Discarding samples.\n", head, old );
1737  /* head points to a known good entry, start there. */
1738  old = head;
1739  }
1740 
1741  for( ; old != head; ) {
1742  perf_sample_event_t *event = ( perf_sample_event_t * )
1743  & data[old & pe->mask];
1744  perf_sample_event_t event_copy;
1745  size_t size = event->header.size;
1746 
1747  /* Event straddles the mmap boundary -- header should always */
1748  /* be inside due to u64 alignment of output. */
1749  if ( ( old & pe->mask ) + size != ( ( old + size ) & pe->mask ) ) {
1750  uint64_t offset = old;
1751  uint64_t len = min( sizeof ( *event ), size ), cpy;
1752  void *dst = &event_copy;
1753 
1754  do {
1755  cpy = min( pe->mask + 1 - ( offset & pe->mask ), len );
1756  memcpy( dst, &data[offset & pe->mask], cpy );
1757  offset += cpy;
1758  dst = ((unsigned char*)dst) + cpy;
1759  len -= cpy;
1760  } while ( len );
1761 
1762  event = &event_copy;
1763  }
1764  old += size;
1765 
1766  SUBDBG( "event->type = %08x\n", event->header.type );
1767  SUBDBG( "event->size = %d\n", event->header.size );
1768 
1769  switch ( event->header.type ) {
1770  case PERF_RECORD_SAMPLE:
1771  _papi_hwi_dispatch_profile( ( *thr )->running_eventset[cidx],
1772  ( caddr_t ) ( unsigned long ) event->ip.ip,
1773  0, profile_index );
1774  break;
1775 
1776  case PERF_RECORD_LOST:
1777  SUBDBG( "Warning: because of a mmap buffer overrun, %" PRId64
1778  " events were lost.\n"
1779  "Loss was recorded when counter id %#"PRIx64
1780  " overflowed.\n", event->lost.lost, event->lost.id );
1781  break;
1782 
1783  default:
1784  SUBDBG( "Error: unexpected header type - %d\n",
1785  event->header.type );
1786  break;
1787  }
1788  }
1789 
1790  pe->tail = old;
1791  mmap_write_tail( pe, old );
1792 }
1793 
1794 /* Find a native event specified by a profile index */
1795 static int
1796 find_profile_index( EventSetInfo_t *ESI, int evt_idx, int *flags,
1797  unsigned int *native_index, int *profile_index )
1798 {
1799  int pos, esi_index, count;
1800 
1801  for ( count = 0; count < ESI->profile.event_counter; count++ ) {
1802  esi_index = ESI->profile.EventIndex[count];
1803  pos = ESI->EventInfoArray[esi_index].pos[0];
1804 
1805  if ( pos == evt_idx ) {
1806  *profile_index = count;
1807  *native_index = ESI->NativeInfoArray[pos].ni_event &
1808  PAPI_NATIVE_AND_MASK;
1809  *flags = ESI->profile.flags;
1810  SUBDBG( "Native event %d is at profile index %d, flags %d\n",
1811  *native_index, *profile_index, *flags );
1812  return PAPI_OK;
1813  }
1814  }
1815  PAPIERROR( "wrong count: %d vs. ESI->profile.event_counter %d", count,
1816  ESI->profile.event_counter );
1817  return PAPI_EBUG;
1818 }
1819 
1820 
1821 
1822 /* What exactly does this do? */
1823 static int
1824 process_smpl_buf( int evt_idx, ThreadInfo_t **thr, int cidx )
1825 {
1826  int ret, flags, profile_index;
1827  unsigned native_index;
1828  pe_control_t *ctl;
1829 
1830  ret = find_profile_index( ( *thr )->running_eventset[cidx], evt_idx,
1831  &flags, &native_index, &profile_index );
1832  if ( ret != PAPI_OK ) {
1833  return ret;
1834  }
1835 
1836  ctl= (*thr)->running_eventset[cidx]->ctl_state;
1837 
1838  mmap_read( cidx, thr,
1839  &(ctl->events[evt_idx]),
1840  profile_index );
1841 
1842  return PAPI_OK;
1843 }
1844 
1845 /*
1846  * This function is used when hardware overflows are working or when
1847  * software overflows are forced
1848  */
1849 
1850 void
1851 _pe_dispatch_timer( int n, hwd_siginfo_t *info, void *uc)
1852 {
1853  ( void ) n; /*unused */
1854  _papi_hwi_context_t hw_context;
1855  int found_evt_idx = -1, fd = info->si_fd;
1856  caddr_t address;
1857  ThreadInfo_t *thread = _papi_hwi_lookup_thread( 0 );
1858  int i;
1859  pe_control_t *ctl;
1860  int cidx = _perf_event_vector.cmp_info.CmpIdx;
1861 
1862  if ( thread == NULL ) {
1863  PAPIERROR( "thread == NULL in _papi_pe_dispatch_timer for fd %d!", fd );
1864  return;
1865  }
1866 
1867  if ( thread->running_eventset[cidx] == NULL ) {
1868  PAPIERROR( "thread->running_eventset == NULL in "
1869  "_papi_pe_dispatch_timer for fd %d!",fd );
1870  return;
1871  }
1872 
1873  if ( thread->running_eventset[cidx]->overflow.flags == 0 ) {
1874  PAPIERROR( "thread->running_eventset->overflow.flags == 0 in "
1875  "_papi_pe_dispatch_timer for fd %d!", fd );
1876  return;
1877  }
1878 
1879  hw_context.si = info;
1880  hw_context.ucontext = ( hwd_ucontext_t * ) uc;
1881 
1882  if ( thread->running_eventset[cidx]->overflow.flags &
1883  PAPI_OVERFLOW_FORCE_SW ) {
1884  address = GET_OVERFLOW_ADDRESS( hw_context );
1885  _papi_hwi_dispatch_overflow_signal( ( void * ) &hw_context,
1886  address, NULL, 0,
1887  0, &thread, cidx );
1888  return;
1889  }
1890 
1891  if ( thread->running_eventset[cidx]->overflow.flags !=
1892  PAPI_OVERFLOW_HARDWARE ) {
1893  PAPIERROR( "thread->running_eventset->overflow.flags is set to "
1894  "something other than PAPI_OVERFLOW_HARDWARE or "
1895  "PAPI_OVERFLOW_FORCE_SW for fd %d (%#x)",
1896  fd , thread->running_eventset[cidx]->overflow.flags);
1897  }
1898 
1899  /* convoluted way to get ctl */
1900  ctl= thread->running_eventset[cidx]->ctl_state;
1901 
1902  /* See if the fd is one that's part of the this thread's context */
1903  for( i=0; i < ctl->num_events; i++ ) {
1904  if ( fd == ctl->events[i].event_fd ) {
1905  found_evt_idx = i;
1906  break;
1907  }
1908  }
1909 
1910  if ( found_evt_idx == -1 ) {
1911  PAPIERROR( "Unable to find fd %d among the open event fds "
1912  "_papi_hwi_dispatch_timer!", fd );
1913  return;
1914  }
1915 
1916  if (ioctl( fd, PERF_EVENT_IOC_DISABLE, NULL ) == -1 ) {
1917  PAPIERROR("ioctl(PERF_EVENT_IOC_DISABLE) failed.\n");
1918  }
1919 
1920  if ( ( thread->running_eventset[cidx]->state & PAPI_PROFILING ) &&
1921  !( thread->running_eventset[cidx]->profile.flags &
1922  PAPI_PROFIL_FORCE_SW ) ) {
1923  process_smpl_buf( found_evt_idx, &thread, cidx );
1924  }
1925  else {
1926  uint64_t ip;
1927  unsigned int head;
1928  pe_event_info_t *pe = &(ctl->events[found_evt_idx]);
1929  unsigned char *data = ((unsigned char*)pe->mmap_buf) + getpagesize( );
1930 
1931  /*
1932  * Read the most recent IP from the sample in the mmap buffer. To
1933  * do this, we make the assumption that all of the records in the
1934  * mmap buffer are the same size, and that they all contain the IP as
1935  * their only record element. This means that we can use the
1936  * data_head element from the user page and move backward one record
1937  * from that point and read the data. Since we don't actually need
1938  * to access the header of the record, we can just subtract 8 (size
1939  * of the IP) from data_head and read that word from the mmap
1940  * buffer. After we subtract 8, we account for mmap buffer wrapping
1941  * by AND'ing this offset with the buffer mask.
1942  */
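 /*
  * Worked example (hypothetical numbers, not taken from this file):
  * with two 4096-byte data pages, pe->mask = 2*4096 - 1 = 8191. If
  * data_head has advanced to 8200, the newest 8-byte IP begins at
  * (8200 - 8) & 8191 = 8192 & 8191 = 0, i.e. the sample wrapped back
  * to the start of the data area.
  */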
1943  head = mmap_read_head( pe );
1944 
1945  if ( head == 0 ) {
1946  PAPIERROR( "Attempting to access memory which may be inaccessable" );
1947  return;
1948  }
1949  ip = *( uint64_t * ) ( data + ( ( head - 8 ) & pe->mask ) );
1950  /*
1951  * Update the tail to the current head pointer.
1952  *
1953  * Note that if we were to read the record at the tail pointer,
1954  * rather than the one at the head (as you might otherwise think
1955  * would be natural), we could run into problems. Signals don't
1956  * stack well on Linux, particularly if not using RT signals, and if
1957  * they come in rapidly enough, we can lose some. Over time, the head
1958  * could catch up to the tail and monitoring would be stopped, and
1959  * since no more signals are coming in, this problem will never be
1960  * resolved, resulting in a complete loss of overflow notification
1961  * from that point on. So the solution we use here will result in
1962  * only the most recent IP value being read every time there are two
1963  * or more samples in the buffer (for that one overflow signal). But
1964  * the handler will always bring up the tail, so the head should
1965  * never run into the tail.
1966  */
1967  mmap_write_tail( pe, head );
1968 
1969  /*
1970  * The fourth parameter is supposed to be a vector of bits indicating
1971  * the overflowed hardware counters, but it's not really clear that
1972  * it's useful, because the actual hardware counters used are not
1973  * exposed to the PAPI user. For now, I'm just going to set the bit
1974  * that indicates which event register in the array overflowed. The
1975  * result is that the overflow vector will not be identical to the
1976  * perfmon implementation, and part of that is due to the fact that
1977  * which hardware register is actually being used is opaque at the
1978  * user level (the kernel event dispatcher hides that info).
1979  */
1980 
1981  _papi_hwi_dispatch_overflow_signal( ( void * ) &hw_context,
1982  ( caddr_t ) ( unsigned long ) ip,
1983  NULL, ( 1 << found_evt_idx ), 0,
1984  &thread, cidx );
1985 
1986  }
1987 
1988  /* Restart the counters */
1989  if (ioctl( fd, PERF_EVENT_IOC_REFRESH, PAPI_REFRESH_VALUE ) == -1) {
1990  PAPIERROR( "overflow refresh failed", 0 );
1991  }
1992 }
1993 
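The handler above follows the standard perf_event mmap ring-buffer handshake: read data_head from the control page, consume the sample, then publish data_tail. The following is a minimal standalone sketch of the same steps for the single-IP-sample layout used here; the function name, the barrier choice, and the power-of-two data-page count are illustrative assumptions, not code from this file.

#include <stdint.h>
#include <unistd.h>
#include <linux/perf_event.h>

/* Fetch the newest 8-byte IP sample from a perf_event mmap buffer.  */
/* 'buf' is the mapping returned by mmap(): one control page         */
/* followed by n_data_pages data pages (a power of two). With        */
/* sample_type == PERF_SAMPLE_IP, the last 8 bytes of each record    */
/* are the IP.                                                       */
static uint64_t
fetch_latest_ip( void *buf, uint64_t n_data_pages )
{
        struct perf_event_mmap_page *ctrl = buf;
        unsigned char *data = ( unsigned char * ) buf + getpagesize( );
        uint64_t mask = n_data_pages * getpagesize( ) - 1;
        uint64_t head = ctrl->data_head;
        uint64_t ip;

        __sync_synchronize( );  /* read barrier: data_head before sample data */

        /* Step back one IP-sized word, wrapping with the buffer mask */
        ip = *( uint64_t * ) ( data + ( ( head - 8 ) & mask ) );

        ctrl->data_tail = head; /* mark everything up to head as consumed */

        return ip;
}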
1994 /* Stop profiling */
1995 int
1996 _pe_stop_profiling( ThreadInfo_t *thread, EventSetInfo_t *ESI )
1997 {
1998  int i, ret = PAPI_OK;
1999  pe_control_t *ctl;
2000  int cidx;
2001 
2002  ctl=ESI->ctl_state;
2003 
2004  cidx=ctl->cidx;
2005 
2006  /* Loop through all of the events and process those which have mmap */
2007  /* buffers attached. */
2008  for ( i = 0; i < ctl->num_events; i++ ) {
2009  /* Use the mmap_buf field as an indicator of this fd being used for */
2010  /* profiling. */
2011  if ( ctl->events[i].mmap_buf ) {
2012  /* Process any remaining samples in the sample buffer */
2013  ret = process_smpl_buf( i, &thread, cidx );
2014  if ( ret ) {
2015  PAPIERROR( "process_smpl_buf returned error %d", ret );
2016  return ret;
2017  }
2018  }
2019  }
2020  return ret;
2021 }
2022 
2023 /* Setup an event to cause overflow */
2024 int
2025 _pe_set_overflow( EventSetInfo_t *ESI, int EventIndex, int threshold )
2026 {
2027 
2028  pe_context_t *ctx;
2029  pe_control_t *ctl = (pe_control_t *) ( ESI->ctl_state );
2030  int i, evt_idx, found_non_zero_sample_period = 0, retval = PAPI_OK;
2031  int cidx;
2032 
2033  cidx = ctl->cidx;
2034  ctx = ( pe_context_t *) ( ESI->master->context[cidx] );
2035 
2036  evt_idx = ESI->EventInfoArray[EventIndex].pos[0];
2037 
2038  SUBDBG("Attempting to set overflow for index %d (%d) of EventSet %d\n",
2039  evt_idx,EventIndex,ESI->EventSetIndex);
2040 
2041  if (evt_idx<0) {
2042  return PAPI_EINVAL;
2043  }
2044 
2045  if ( threshold == 0 ) {
2046  /* If this counter isn't set to overflow, it's an error */
2047  if ( ctl->events[evt_idx].attr.sample_period == 0 ) return PAPI_EINVAL;
2048  }
2049 
2050  ctl->events[evt_idx].attr.sample_period = threshold;
2051 
2052  /*
2053  * Note that the wakeup_mode field initially will be set to zero
2054  * (WAKEUP_MODE_COUNTER_OVERFLOW) as a result of a call to memset 0 to
2055  * all of the events in the ctl struct.
2056  *
2057  * Is it even set to any other value elsewhere?
2058  */
2059  switch ( ctl->events[evt_idx].wakeup_mode ) {
2060  case WAKEUP_MODE_PROFILING:
2061  /* Setting wakeup_events to special value zero means issue a */
2062  /* wakeup (signal) on every mmap page overflow. */
2063  ctl->events[evt_idx].attr.wakeup_events = 0;
2064  break;
2065 
2066  case WAKEUP_MODE_COUNTER_OVERFLOW:
2067  /* Can this code ever be called? */
2068 
2069  /* Setting wakeup_events to one means issue a wakeup on every */
2070  /* counter overflow (not mmap page overflow). */
2071  ctl->events[evt_idx].attr.wakeup_events = 1;
2072  /* We need the IP to pass to the overflow handler */
2073  ctl->events[evt_idx].attr.sample_type = PERF_SAMPLE_IP;
2074  /* one for the user page, and two to take IP samples */
2075  ctl->events[evt_idx].nr_mmap_pages = 1 + 2;
2076  break;
2077  default:
2078  PAPIERROR( "ctl->wakeup_mode[%d] set to an unknown value - %u",
2079  evt_idx, ctl->events[evt_idx].wakeup_mode);
2080  return PAPI_EBUG;
2081  }
2082 
2083  /* Check for non-zero sample period */
2084  for ( i = 0; i < ctl->num_events; i++ ) {
2085  if ( ctl->events[i].attr.sample_period ) {
2086  found_non_zero_sample_period = 1;
2087  break;
2088  }
2089  }
2090 
2091  if ( found_non_zero_sample_period ) {
2092  /* turn on internal overflow flag for this event set */
2093  ctl->overflow = 1;
2094 
2095  /* Enable the signal handler */
2096  retval = _papi_hwi_start_signal(
2097  ctl->overflow_signal,
2098  1, ctl->cidx );
2099  } else {
2100  /* turn off internal overflow flag for this event set */
2101  ctl->overflow = 0;
2102 
2103  /* Remove the signal handler, if there are no remaining non-zero */
2104  /* sample_periods set */
2105  retval = _papi_hwi_stop_signal( ctl->overflow_signal );
2106  if ( retval != PAPI_OK ) return retval;
2107  }
2108 
2109  retval = _pe_update_control_state( ctl, NULL,
2110  ( (pe_control_t *) (ESI->ctl_state) )->num_events,
2111  ctx );
2112 
2113  return retval;
2114 }
2115 
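For context, this entry point is reached from the user-level PAPI_overflow() call. A minimal sketch of driving it follows; the event choice, threshold, and the omission of error checking are simplifications for illustration.

#include <stdio.h>
#include "papi.h"

/* Called asynchronously each time the threshold is crossed */
static void
overflow_handler( int EventSet, void *address, long long overflow_vector,
                  void *context )
{
        ( void ) EventSet; ( void ) overflow_vector; ( void ) context;
        printf( "overflow at %p\n", address );
}

int
main( void )
{
        int EventSet = PAPI_NULL;

        PAPI_library_init( PAPI_VER_CURRENT );
        PAPI_create_eventset( &EventSet );
        PAPI_add_event( EventSet, PAPI_TOT_INS );

        /* Deliver an overflow signal every 1000000 instructions */
        PAPI_overflow( EventSet, PAPI_TOT_INS, 1000000, 0,
                       overflow_handler );

        PAPI_start( EventSet );
        /* ... workload ... */
        PAPI_stop( EventSet, NULL );
        return 0;
}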
2116 /* Enable profiling */
2117 int
2118 _pe_set_profile( EventSetInfo_t *ESI, int EventIndex, int threshold )
2119 {
2120  int ret;
2121  int evt_idx;
2122  pe_control_t *ctl = ( pe_control_t *) ( ESI->ctl_state );
2123 
2124  /* Since you can't profile on a derived event, the event is always the */
2125  /* first and only event in the native event list. */
2126  evt_idx = ESI->EventInfoArray[EventIndex].pos[0];
2127 
2128  if ( threshold == 0 ) {
2129  SUBDBG( "MUNMAP(%p,%"PRIu64")\n", ctl->events[evt_idx].mmap_buf,
2130  ( uint64_t ) ctl->events[evt_idx].nr_mmap_pages *
2131  getpagesize( ) );
2132 
2133  if ( ctl->events[evt_idx].mmap_buf ) {
2134  munmap( ctl->events[evt_idx].mmap_buf,
2135  ctl->events[evt_idx].nr_mmap_pages * getpagesize() );
2136  }
2137  ctl->events[evt_idx].mmap_buf = NULL;
2138  ctl->events[evt_idx].nr_mmap_pages = 0;
2139  ctl->events[evt_idx].attr.sample_type &= ~PERF_SAMPLE_IP;
2140  ret = _pe_set_overflow( ESI, EventIndex, threshold );
2141  /* ??? #warning "This should be handled somewhere else" */
2142  ESI->state &= ~( PAPI_OVERFLOWING );
2143  ESI->overflow.flags &= ~( PAPI_OVERFLOW_HARDWARE );
2144 
2145  return ret;
2146  }
2147 
2148  /* EAR (event address register) style profiling is not supported */
2149  if ( ESI->profile.flags & ( PAPI_PROFIL_DATA_EAR | PAPI_PROFIL_INST_EAR ) ) {
2150  /* Not supported yet... */
2151 
2152  return PAPI_ENOSUPP;
2153  }
2154  if ( ESI->profile.flags & PAPI_PROFIL_RANDOM ) {
2155  /* This requires an ability to randomly alter the sample_period within */
2156  /* a given range. Kernel does not have this ability. FIXME */
2157  return PAPI_ENOSUPP;
2158  }
2159 
2160  /* Just a guess at how many pages would make this relatively efficient. */
2161  /* Note that it's "1 +" because of the need for a control page, and the */
2162  /* number following the "+" must be a power of 2 (1, 2, 4, 8, 16, etc.) */
2163  /* zero. This is required to optimize dealing with circular buffer */
2164  /* wrapping of the mapped pages. */
2165 
2166  ctl->events[evt_idx].nr_mmap_pages = (1+8);
2167  ctl->events[evt_idx].attr.sample_type |= PERF_SAMPLE_IP;
2168 
2169  ret = _pe_set_overflow( ESI, EventIndex, threshold );
2170  if ( ret != PAPI_OK ) return ret;
2171 
2172  return PAPI_OK;
2173 }
2174 
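At the user level, _pe_set_profile above sits behind the PAPI_profil() call. A minimal sketch follows, assuming an event set that already contains PAPI_TOT_CYC; the bucket count, scale, and threshold are illustrative guesses, and error checking is elided.

#include <string.h>
#include "papi.h"

#define NUM_BUCKETS 8192

static unsigned short buckets[NUM_BUCKETS];

/* Profile PAPI_TOT_CYC over the text range starting at 'start' */
int
profile_region( caddr_t start, int EventSet )
{
        memset( buckets, 0, sizeof ( buckets ) );

        /* scale 65536 maps one 16-bit bucket to every two bytes of  */
        /* text; a sample is taken every 1000000 cycles              */
        return PAPI_profil( buckets, sizeof ( buckets ), start, 65536,
                            EventSet, PAPI_TOT_CYC, 1000000,
                            PAPI_PROFIL_POSIX );
}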
2175 
2176 /* Our component vector */
2177 
2178 papi_vector_t _perf_event_vector = {
2179  .cmp_info = {
2180  /* component information (unspecified values initialized to 0) */
2181  .name = "perf_event",
2182  .short_name = "perf",
2183  .version = "5.0",
2184  .description = "Linux perf_event CPU counters",
2185 
2186  .default_domain = PAPI_DOM_USER,
2187  .available_domains = PAPI_DOM_USER | PAPI_DOM_KERNEL | PAPI_DOM_SUPERVISOR,
2188  .default_granularity = PAPI_GRN_THR,
2189  .available_granularities = PAPI_GRN_THR | PAPI_GRN_SYS,
2190 
2191  .hardware_intr = 1,
2192  .kernel_profile = 1,
2193 
2194  /* component specific cmp_info initializations */
2195  .fast_virtual_timer = 0,
2196  .attach = 1,
2197  .attach_must_ptrace = 1,
2198  .cpu = 1,
2199  .inherit = 1,
2200  .cntr_umasks = 1,
2201 
2202  },
2203 
2204  /* sizes of framework-opaque component-private structures */
2205  .size = {
2206  .context = sizeof ( pe_context_t ),
2207  .control_state = sizeof ( pe_control_t ),
2208  .reg_value = sizeof ( int ),
2209  .reg_alloc = sizeof ( int ),
2210  },
2211 
2212  /* function pointers in this component */
2213  .init_component = _pe_init_component,
2214  .shutdown_component = _pe_shutdown_component,
2215  .init_thread = _pe_init_thread,
2216  .init_control_state = _pe_init_control_state,
2217  .dispatch_timer = _pe_dispatch_timer,
2218 
2219  /* function pointers from the shared perf_event lib */
2220  .start = _pe_start,
2221  .stop = _pe_stop,
2222  .read = _pe_read,
2223  .shutdown_thread = _pe_shutdown_thread,
2224  .ctl = _pe_ctl,
2225  .update_control_state = _pe_update_control_state,
2226  .set_domain = _pe_set_domain,
2227  .reset = _pe_reset,
2228  .set_overflow = _pe_set_overflow,
2229  .set_profile = _pe_set_profile,
2230  .stop_profiling = _pe_stop_profiling,
2231  .write = _pe_write,
2232 
2233 
2234  /* from counter name mapper */
2235  .ntv_enum_events = _pe_ntv_enum_events,
2236  .ntv_name_to_code = _pe_ntv_name_to_code,
2237  .ntv_code_to_name = _pe_ntv_code_to_name,
2238  .ntv_code_to_descr = _pe_ntv_code_to_descr,
2239  .ntv_code_to_info = _pe_ntv_code_to_info,
2240 };
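The fields set in this vector surface to users through the component query API. A small sketch, assuming PAPI was built with this component (the names match cmp_info above):

#include <stdio.h>
#include "papi.h"

int
main( void )
{
        PAPI_library_init( PAPI_VER_CURRENT );

        /* Look the component up by the name set in cmp_info */
        int cidx = PAPI_get_component_index( "perf_event" );
        if ( cidx < 0 ) {
                fprintf( stderr, "perf_event component not found\n" );
                return 1;
        }

        const PAPI_component_info_t *info = PAPI_get_component_info( cidx );
        printf( "%s: %s (attach=%d, cpu=%d, inherit=%d)\n",
                info->name, info->description,
                info->attach, info->cpu, info->inherit );
        return 0;
}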