PAPI  5.3.0.0
perf_event.c
1 /*
2 * File: perf_event.c
3 *
4 * Author: Corey Ashford
5 * cjashfor@us.ibm.com
6 * - based upon perfmon.c written by -
7 * Philip Mucci
8 * mucci@cs.utk.edu
9 * Mods: Gary Mohr
10 * gary.mohr@bull.com
11 * Mods: Vince Weaver
12 * vweaver1@eecs.utk.edu
13 * Mods: Philip Mucci
14 * mucci@eecs.utk.edu */
15 
16 
17 #include <fcntl.h>
18 #include <string.h>
19 #include <errno.h>
20 #include <signal.h>
21 #include <syscall.h>
22 #include <sys/utsname.h>
23 #include <sys/mman.h>
24 #include <sys/ioctl.h>
25 
26 /* PAPI-specific includes */
27 #include "papi.h"
28 #include "papi_memory.h"
29 #include "papi_internal.h"
30 #include "papi_vector.h"
31 #include "extras.h"
32 
33 /* libpfm4 includes */
34 #include "papi_libpfm4_events.h"
35 #include "pe_libpfm4_events.h"
36 #include "perfmon/pfmlib.h"
37 #include PEINCLUDE
38 
39 /* Linux-specific includes */
40 #include "mb.h"
41 #include "linux-memory.h"
42 #include "linux-timer.h"
43 #include "linux-common.h"
44 #include "linux-context.h"
45 
46 #include "perf_event_lib.h"
47 
48 /* Defines for ctx->state */
49 #define PERF_EVENTS_OPENED 0x01
50 #define PERF_EVENTS_RUNNING 0x02
51 
52 /* Static globals */
53 static int nmi_watchdog_active;
54 
55 /* Forward declaration */
56 papi_vector_t _perf_event_vector;
57 
58 /* Globals */
59 struct native_event_table_t perf_native_event_table;
60 int our_cidx;
61 
62 /* These sentinels tell _pe_set_overflow() how to set the */
63 /* wakeup_events field in the event descriptor record. */
64 
65 #define WAKEUP_COUNTER_OVERFLOW 0
66 #define WAKEUP_PROFILING -1
67 
68 #define WAKEUP_MODE_COUNTER_OVERFLOW 0
69 #define WAKEUP_MODE_PROFILING 1
70 
71 /* The kernel developers say to never use a refresh value of 0 */
72 /* See https://lkml.org/lkml/2011/5/24/172 */
73 /* However, on some platforms (like Power) a value of 1 does not work */
74 /* We're still tracking down why this happens. */
75 
76 #if defined(__powerpc__)
77 #define PAPI_REFRESH_VALUE 0
78 #else
79 #define PAPI_REFRESH_VALUE 1
80 #endif
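/* For illustration: this value is what gets passed when re-arming */
/* overflow delivery after a signal (see _pe_dispatch_timer below): */
/*     ioctl( fd, PERF_EVENT_IOC_REFRESH, PAPI_REFRESH_VALUE );    */
/* A refresh argument of n permits n more overflow interrupts      */
/* before the kernel stops the event.                              */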
81 
82 /* Check for processor support */
83 /* Can be used for generic checking, though in general we only */
84 /* check for pentium4 here because support was broken for multiple */
85 /* kernel releases and the usual standard detections did not */
86 /* handle this. So we check for pentium 4 explicitly. */
87 static int
88 processor_supported(int vendor, int family) {
89 
90  /* Error out if kernel too early to support p4 */
91  if (( vendor == PAPI_VENDOR_INTEL ) && (family == 15)) {
92  if (_papi_os_info.os_version < LINUX_VERSION(2,6,35)) {
93  PAPIERROR("Pentium 4 not supported on kernels before 2.6.35");
94  return PAPI_ENOSUPP;
95  }
96  }
97  return PAPI_OK;
98 }
99 
100 /* Fix up the config based on what CPU/Vendor we are running on */
101 static int
102 pe_vendor_fixups(papi_vector_t *vector)
103 {
104  /* powerpc */
105  /* On IBM and Power6 Machines default domain should include supervisor */
106  if ( _papi_hwi_system_info.hw_info.vendor == PAPI_VENDOR_IBM ) {
107  vector->cmp_info.available_domains |=
108  PAPI_DOM_KERNEL | PAPI_DOM_SUPERVISOR;
109  if (strcmp(_papi_hwi_system_info.hw_info.model_string, "POWER6" ) == 0 ) {
110  vector->cmp_info.default_domain =
111  PAPI_DOM_USER | PAPI_DOM_KERNEL | PAPI_DOM_SUPERVISOR;
112  }
113  }
114 
115  if ( _papi_hwi_system_info.hw_info.vendor == PAPI_VENDOR_MIPS ) {
116  vector->cmp_info.available_domains |= PAPI_DOM_KERNEL;
117  }
118 
119  if ((_papi_hwi_system_info.hw_info.vendor == PAPI_VENDOR_INTEL) ||
120  (_papi_hwi_system_info.hw_info.vendor == PAPI_VENDOR_AMD)) {
121  vector->cmp_info.fast_real_timer = 1;
122  }
123  /* ARM */
124  if ( _papi_hwi_system_info.hw_info.vendor == PAPI_VENDOR_ARM) {
125  /* FIXME: this will change with Cortex A15 */
126  vector->cmp_info.available_domains |=
127  PAPI_DOM_USER | PAPI_DOM_KERNEL | PAPI_DOM_SUPERVISOR;
128  vector->cmp_info.default_domain =
129  PAPI_DOM_USER | PAPI_DOM_KERNEL | PAPI_DOM_SUPERVISOR;
130  }
131 
132  /* CRAY */
133  if ( _papi_hwi_system_info.hw_info.vendor == PAPI_VENDOR_CRAY ) {
134  vector->cmp_info.available_domains |= PAPI_DOM_OTHER;
135  }
136 
137  return PAPI_OK;
138 }
139 
140 
141 
142 /******************************************************************/
143 /******** Kernel Version Dependent Routines **********************/
144 /******************************************************************/
145 
146 /* KERNEL_CHECKS_SCHEDUABILITY_UPON_OPEN is a work-around for kernel arch
147  * implementations (e.g. x86) which don't do a static event schedulability
148  * check in sys_perf_event_open.
149  * This was fixed for x86 in the 2.6.33 kernel
150  *
151  * Also! Kernels newer than 2.6.34 will fail in a similar way
152  * if the nmi_watchdog has stolen a performance counter
153  * and we try to use the maximum number of counters.
154  * A sys_perf_event_open() will seem to succeed but will fail
155  * at read time. So re-use this work around code.
156  */
157 static int
158 bug_check_scheduability(void) {
159 
160 #if defined(__powerpc__)
161  /* PowerPC not affected by this bug */
162 #elif defined(__mips__)
163  /* MIPS as of kernel 3.1 does not properly detect schedulability */
164  return 1;
165 #else
166  if (_papi_os_info.os_version < LINUX_VERSION(2,6,33)) return 1;
167 #endif
168 
169  if (nmi_watchdog_active) return 1;
170 
171  return 0;
172 }
173 
174 /* PERF_FORMAT_GROUP allows reading an entire group's counts at once */
175 /* before 2.6.34 PERF_FORMAT_GROUP did not work when reading results */
176 /* from attached processes. We are lazy and disable it for all cases */
177 /* commit was: 050735b08ca8a016bbace4445fa025b88fee770b */
178 
179 static int
180 bug_format_group(void) {
181 
182  if (_papi_os_info.os_version < LINUX_VERSION(2,6,34)) return 1;
183 
184  /* MIPS, as of version 3.1, does not support this properly */
185 
186 #if defined(__mips__)
187  return 1;
188 #endif
189 
190  return 0;
191 
192 }
193 
194 
195 /* There's a bug prior to Linux 2.6.33 where if you are using */
196 /* PERF_FORMAT_GROUP, the TOTAL_TIME_ENABLED and */
197 /* TOTAL_TIME_RUNNING fields will be zero unless you disable */
198 /* the counters first */
199 static int
200 bug_sync_read(void) {
201 
202  if (_papi_os_info.os_version < LINUX_VERSION(2,6,33)) return 1;
203 
204  return 0;
205 
206 }
207 
208 
209 /* Set the F_SETOWN_EX flag on the fd. */
210 /* This affects which thread an overflow signal gets sent to */
211 /* Handled in a subroutine because the behavior */
212 /* depends on the kernel version. */
213 static int
214 fcntl_setown_fd(int fd) {
215 
216  int ret;
217  struct f_owner_ex fown_ex;
218 
219  /* F_SETOWN_EX is not available until 2.6.32 */
220  if (_papi_os_info.os_version < LINUX_VERSION(2,6,32)) {
221 
222  /* get ownership of the descriptor */
223  ret = fcntl( fd, F_SETOWN, mygettid( ) );
224  if ( ret == -1 ) {
225  PAPIERROR( "cannot fcntl(F_SETOWN) on %d: %s", fd, strerror(errno) );
226  return PAPI_ESYS;
227  }
228  }
229  else {
230  /* set ownership of the descriptor */
231  fown_ex.type = F_OWNER_TID;
232  fown_ex.pid = mygettid();
233  ret = fcntl(fd, F_SETOWN_EX, (unsigned long)&fown_ex );
234 
235  if ( ret == -1 ) {
236  PAPIERROR( "cannot fcntl(F_SETOWN_EX) on %d: %s",
237  fd, strerror( errno ) );
238  return PAPI_ESYS;
239  }
240  }
241  return PAPI_OK;
242 }
243 
244 /* The read format on perf_event varies based on various flags that */
245 /* are passed into it. This helper avoids copying this logic */
246 /* multiple places. */
247 static unsigned int
248 get_read_format( unsigned int multiplex,
249  unsigned int inherit,
250  int format_group )
251 {
252  unsigned int format = 0;
253 
254  /* if we need read format options for multiplexing, add them now */
255  if (multiplex) {
256  format |= PERF_FORMAT_TOTAL_TIME_ENABLED;
257  format |= PERF_FORMAT_TOTAL_TIME_RUNNING;
258  }
259 
260  /* if our kernel supports it and we are not using inherit, */
261  /* add the group read options */
262  if ( (!bug_format_group()) && !inherit) {
263  if (format_group) {
264  format |= PERF_FORMAT_GROUP;
265  }
266  }
267 
268  SUBDBG("multiplex: %d, inherit: %d, group_leader: %d, format: %#x\n",
269  multiplex, inherit, format_group, format);
270 
271  return format;
272 }
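/* For illustration, the combinations this file actually requests:   */
/* a multiplexed event (each one its own leader) gets                */
/*     PERF_FORMAT_TOTAL_TIME_ENABLED|PERF_FORMAT_TOTAL_TIME_RUNNING */
/* a non-multiplexed, non-inherited group leader on a recent kernel  */
/* gets PERF_FORMAT_GROUP, and a non-leader group member gets 0      */
/* (a plain single u64 count).                                       */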
273 
274 /*****************************************************************/
275 /********* End Kernel-version Dependent Routines ****************/
276 /*****************************************************************/
277 
278 /*****************************************************************/
279 /********* Begin perf_event low-level code ***********************/
280 /*****************************************************************/
281 
282 /* In case headers aren't new enough to have __NR_perf_event_open */
283 #ifndef __NR_perf_event_open
284 
285 #ifdef __powerpc__
286 #define __NR_perf_event_open 319
287 #elif defined(__x86_64__)
288 #define __NR_perf_event_open 298
289 #elif defined(__i386__)
290 #define __NR_perf_event_open 336
291 #elif defined(__arm__)
292 #define __NR_perf_event_open 366+0x900000
293 #endif
294 
295 #endif
296 
297 static long
298 sys_perf_event_open( struct perf_event_attr *hw_event, pid_t pid, int cpu,
299  int group_fd, unsigned long flags )
300 {
301  int ret;
302 
303  SUBDBG("sys_perf_event_open(%p,%d,%d,%d,%lx)\n",hw_event,pid,cpu,group_fd,flags);
304  SUBDBG(" type: %d\n",hw_event->type);
305  SUBDBG(" size: %d\n",hw_event->size);
306  SUBDBG(" config: %"PRIx64" (%"PRIu64")\n",hw_event->config,
307  hw_event->config);
308  SUBDBG(" sample_period: %"PRIu64"\n",hw_event->sample_period);
309  SUBDBG(" sample_type: %"PRIu64"\n",hw_event->sample_type);
310  SUBDBG(" read_format: %"PRIu64"\n",hw_event->read_format);
311  SUBDBG(" disabled: %d\n",hw_event->disabled);
312  SUBDBG(" inherit: %d\n",hw_event->inherit);
313  SUBDBG(" pinned: %d\n",hw_event->pinned);
314  SUBDBG(" exclusive: %d\n",hw_event->exclusive);
315  SUBDBG(" exclude_user: %d\n",hw_event->exclude_user);
316  SUBDBG(" exclude_kernel: %d\n",hw_event->exclude_kernel);
317  SUBDBG(" exclude_hv: %d\n",hw_event->exclude_hv);
318  SUBDBG(" exclude_idle: %d\n",hw_event->exclude_idle);
319  SUBDBG(" mmap: %d\n",hw_event->mmap);
320  SUBDBG(" comm: %d\n",hw_event->comm);
321  SUBDBG(" freq: %d\n",hw_event->freq);
322  SUBDBG(" inherit_stat: %d\n",hw_event->inherit_stat);
323  SUBDBG(" enable_on_exec: %d\n",hw_event->enable_on_exec);
324  SUBDBG(" task: %d\n",hw_event->task);
325  SUBDBG(" watermark: %d\n",hw_event->watermark);
326  ret =
327  syscall( __NR_perf_event_open, hw_event, pid, cpu, group_fd, flags );
328  SUBDBG("Returned %d %d %s\n",ret,
329  ret<0?errno:0,
330  ret<0?strerror(errno):" ");
331  return ret;
332 }
333 
334 
335 static int map_perf_event_errors_to_papi(int perf_event_error) {
336 
337  int ret;
338 
339  /* These mappings are approximate.
340  EINVAL in particular can mean lots of different things */
341  switch(perf_event_error) {
342  case EPERM:
343  case EACCES:
344  ret = PAPI_EPERM;
345  break;
346  case ENODEV:
347  case EOPNOTSUPP:
348  ret = PAPI_ENOSUPP;
349  break;
350  case ENOENT:
351  ret = PAPI_ENOEVNT;
352  break;
353  case ENOSYS:
354  case EAGAIN:
355  case EBUSY:
356  case E2BIG:
357  ret = PAPI_ESYS;
358  break;
359  case ENOMEM:
360  ret = PAPI_ENOMEM;
361  break;
362  case EINVAL:
363  default:
364  ret = PAPI_EINVAL;
365  break;
366  }
367  return ret;
368 }
369 
370 
371 /* Check if the current set of options is supported by */
372 /* perf_events. */
373 /* We do this by temporarily opening an event with the */
374 /* desired options then closing it again. We use the */
375 /* PERF_COUNT_HW_INSTRUCTION event as a dummy event */
376 /* on the assumption it is available on all */
377 /* platforms. */
378 
379 static int
380 check_permissions( unsigned long tid,
381  unsigned int cpu_num,
382  unsigned int domain,
383  unsigned int granularity,
384  unsigned int multiplex,
385  unsigned int inherit )
386 {
387  int ev_fd;
388  struct perf_event_attr attr;
389 
390  long pid;
391 
392  /* clearing this sets a type of hardware and counts all domains */
393  memset(&attr, '\0', sizeof(attr));
394  attr.read_format = get_read_format(multiplex, inherit, 1);
395 
396  /* set the event id (config field) to instructions */
397  /* (an event that should always exist) */
398  /* This was cycles but that is missing on Niagara */
399  attr.config = PERF_COUNT_HW_INSTRUCTIONS;
400 
401  /* now set up domains this event set will be counting */
402  if (!(domain & PAPI_DOM_SUPERVISOR)) {
403  attr.exclude_hv = 1;
404  }
405  if (!(domain & PAPI_DOM_USER)) {
406  attr.exclude_user = 1;
407  }
408  if (!(domain & PAPI_DOM_KERNEL)) {
409  attr.exclude_kernel = 1;
410  }
411 
412  if (granularity==PAPI_GRN_SYS) {
413  pid = -1;
414  } else {
415  pid = tid;
416  }
417 
418  SUBDBG("Calling sys_perf_event_open() from check_permissions\n");
419 
420  ev_fd = sys_perf_event_open( &attr, pid, cpu_num, -1, 0 );
421  if ( ev_fd == -1 ) {
422  SUBDBG("sys_perf_event_open returned error. Linux says, %s",
423  strerror( errno ) );
424  return map_perf_event_errors_to_papi( errno );
425  }
426 
427  /* now close it, this was just to make sure we have permissions */
428  /* to set these options */
429  close(ev_fd);
430  return PAPI_OK;
431 }
432 
433 /* Maximum size we ever expect to read from a perf_event fd */
434 /* (this is the number of 64-bit values) */
435 /* We use this to size the read buffers */
436 /* The 3 is for the event count, time_enabled, and time_running */
437 /* fields; the 2x term is a count value and a count id for each */
438 /* possible counter. */
439 #define READ_BUFFER_SIZE (3 + (2 * PERF_EVENT_MAX_MPX_COUNTERS))
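/* Worked example, assuming PERF_EVENT_MAX_MPX_COUNTERS is 64:    */
/* READ_BUFFER_SIZE = 3 + 2*64 = 131 64-bit values, enough for    */
/* nr/time_enabled/time_running plus a (value,id) pair for every  */
/* possible counter.                                              */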
440 
441 
442 
443 /* KERNEL_CHECKS_SCHEDUABILITY_UPON_OPEN is a work-around for kernel arch */
444 /* implementations (e.g. x86 before 2.6.33) which don't do a static event */
445 /* schedulability check in sys_perf_event_open. It is also needed if the */
446 /* kernel is stealing an event, such as when NMI watchdog is enabled. */
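/* The work-around below is, in sketch form:                      */
/*     ioctl( fd, PERF_EVENT_IOC_ENABLE, NULL );                  */
/*     ioctl( fd, PERF_EVENT_IOC_DISABLE, NULL );                 */
/*     if (read( fd, buf, sizeof(buf) ) == 0) -> PAPI_ECNFLCT     */
/* A 0-byte read right after a brief enable/disable means the     */
/* event set could never actually be scheduled onto the hardware. */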
447 
448 static int
449 check_scheduability( pe_context_t *ctx, pe_control_t *ctl, int idx )
450 {
451  int retval = 0, cnt = -1;
452  ( void ) ctx; /*unused */
453  long long papi_pe_buffer[READ_BUFFER_SIZE];
454  int i,group_leader_fd;
455 
456  if (bug_check_scheduability()) {
457 
458  /* If the kernel isn't tracking schedulability right */
459  /* Then we need to start/stop/read to force the event */
460  /* to be scheduled and see if an error condition happens. */
461 
462  /* get the proper fd to start */
463  group_leader_fd=ctl->events[idx].group_leader_fd;
464  if (group_leader_fd==-1) group_leader_fd=ctl->events[idx].event_fd;
465 
466  /* start the event */
467  retval = ioctl( group_leader_fd, PERF_EVENT_IOC_ENABLE, NULL );
468  if (retval == -1) {
469  PAPIERROR("ioctl(PERF_EVENT_IOC_ENABLE) failed.\n");
470  return PAPI_ESYS;
471  }
472 
473  /* stop the event */
474  retval = ioctl(group_leader_fd, PERF_EVENT_IOC_DISABLE, NULL );
475  if (retval == -1) {
476  PAPIERROR( "ioctl(PERF_EVENT_IOC_DISABLE) failed.\n" );
477  return PAPI_ESYS;
478  }
479 
480  /* See if a read returns any results */
481  cnt = read( group_leader_fd, papi_pe_buffer, sizeof(papi_pe_buffer));
482  if ( cnt == -1 ) {
483  SUBDBG( "read returned an error! Should never happen.\n" );
484  return PAPI_ESYS;
485  }
486 
487  if ( cnt == 0 ) {
488  /* We read 0 bytes if we could not schedule the event */
489  /* The kernel should have detected this at open */
490  /* but various bugs (including NMI watchdog) */
491  /* result in this behavior */
492 
493  return PAPI_ECNFLCT;
494 
495  } else {
496 
497  /* Reset all of the counters (opened so far) back to zero */
498  /* from the above brief enable/disable call pair. */
499 
500  /* We have to reset all events because reset of group leader */
501  /* does not reset all. */
502  /* we assume that the events are being added one by one and that */
503  /* we do not need to reset higher events (doing so may reset ones */
504  /* that have not been initialized yet). */
505 
506  /* Note... PERF_EVENT_IOC_RESET does not reset time running */
507  /* info if multiplexing, so we should avoid coming here if */
508  /* we are multiplexing the event. */
509  for( i = 0; i < idx; i++) {
510  retval=ioctl( ctl->events[i].event_fd, PERF_EVENT_IOC_RESET, NULL );
511  if (retval == -1) {
512  PAPIERROR( "ioctl(PERF_EVENT_IOC_RESET) #%d/%d %d "
513  "(fd %d) failed.\n",
514  i,ctl->num_events,idx,ctl->events[i].event_fd);
515  return PAPI_ESYS;
516  }
517  }
518  }
519  }
520  return PAPI_OK;
521 }
522 
523 
524 /* Do some extra work on a perf_event fd if we're doing sampling */
525 /* This mostly means setting up the mmap buffer. */
526 static int
527 tune_up_fd( pe_control_t *ctl, int evt_idx )
528 {
529  int ret;
530  void *buf_addr;
531  int fd = ctl->events[evt_idx].event_fd;
532 
533  /* Register that we would like a SIGIO notification when a mmap'd page */
534  /* becomes full. */
535  ret = fcntl( fd, F_SETFL, O_ASYNC | O_NONBLOCK );
536  if ( ret ) {
537  PAPIERROR ( "fcntl(%d, F_SETFL, O_ASYNC | O_NONBLOCK) "
538  "returned error: %s", fd, strerror( errno ) );
539  return PAPI_ESYS;
540  }
541 
542  /* Set the F_SETOWN_EX flag on the fd. */
543  /* This affects which thread an overflow signal gets sent to. */
544  ret=fcntl_setown_fd(fd);
545  if (ret!=PAPI_OK) return ret;
546 
547  /* Set FD_CLOEXEC. Otherwise if we do an exec with an overflow */
548  /* running, the overflow handler will continue into the exec()'d*/
549  /* process and kill it because no signal handler is set up. */
550  ret=fcntl(fd, F_SETFD, FD_CLOEXEC);
551  if (ret) {
552  return PAPI_ESYS;
553  }
554 
555  /* when you explicitly declare that you want a particular signal, */
556  /* even when you use the default signal, the kernel will send more */
557  /* information concerning the event to the signal handler. */
558  /* */
559  /* In particular, it will send the file descriptor from which the */
560  /* event is originating which can be quite useful when monitoring */
561  /* multiple tasks from a single thread. */
562  ret = fcntl( fd, F_SETSIG, ctl->overflow_signal );
563  if ( ret == -1 ) {
564  PAPIERROR( "cannot fcntl(F_SETSIG,%d) on %d: %s",
565  ctl->overflow_signal, fd,
566  strerror( errno ) );
567  return PAPI_ESYS;
568  }
569 
570  /* mmap() the sample buffer */
571  buf_addr = mmap( NULL, ctl->events[evt_idx].nr_mmap_pages * getpagesize(),
572  PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0 );
573  if ( buf_addr == MAP_FAILED ) {
574  PAPIERROR( "mmap(NULL,%d,%d,%d,%d,0): %s",
575  ctl->events[evt_idx].nr_mmap_pages * getpagesize( ),
576  PROT_READ, MAP_SHARED, fd, strerror( errno ) );
577  return ( PAPI_ESYS );
578  }
579 
580  SUBDBG( "Sample buffer for fd %d is located at %p\n", fd, buf_addr );
581 
582  /* Set up the mmap buffer and its associated helpers */
583  ctl->events[evt_idx].mmap_buf = (struct perf_counter_mmap_page *) buf_addr;
584  ctl->events[evt_idx].tail = 0;
585  ctl->events[evt_idx].mask = ( ctl->events[evt_idx].nr_mmap_pages - 1 ) *
586  getpagesize() - 1;
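/* Worked example (illustrative): with 4096-byte pages and        */
/* nr_mmap_pages = 9, page 0 is the metadata page, pages 1-8 form */
/* a 32768-byte ring buffer, and mask = 8*4096 - 1 = 0x7fff, so   */
/* (offset & mask) wraps any offset into the data area.           */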
587 
588  return PAPI_OK;
589 }
590 
591 
592 
593 /* Open all events in the control state */
594 static int
595 open_pe_events( pe_context_t *ctx, pe_control_t *ctl )
596 {
597 
598  int i, ret = PAPI_OK;
599  long pid;
600 
601  if (ctl->granularity==PAPI_GRN_SYS) {
602  pid = -1;
603  }
604  else {
605  pid = ctl->tid;
606  }
607 
608  for( i = 0; i < ctl->num_events; i++ ) {
609 
610  ctl->events[i].event_opened=0;
611 
612  /* set up the attr structure. We don't set up all fields here */
613  /* as some have already been set up previously. */
614 
615  /* group leader (event 0) is special */
616  /* If we're multiplexed, everyone is a group leader */
617  if (( i == 0 ) || (ctl->multiplexed)) {
618  ctl->events[i].attr.pinned = !ctl->multiplexed;
619  ctl->events[i].attr.disabled = 1;
620  ctl->events[i].group_leader_fd=-1;
621  ctl->events[i].attr.read_format = get_read_format(ctl->multiplexed,
622  ctl->inherit,
623  !ctl->multiplexed );
624  } else {
625  ctl->events[i].attr.pinned=0;
626  ctl->events[i].attr.disabled = 0;
627  ctl->events[i].group_leader_fd=ctl->events[0].event_fd;
628  ctl->events[i].attr.read_format = get_read_format(ctl->multiplexed,
629  ctl->inherit,
630  0 );
631  }
632 
633 
634  /* try to open */
635  ctl->events[i].event_fd = sys_perf_event_open( &ctl->events[i].attr,
636  pid,
637  ctl->cpu,
638  ctl->events[i].group_leader_fd,
639  0 /* flags */
640  );
641 
642  /* Try to match Linux errors to PAPI errors */
643  if ( ctl->events[i].event_fd == -1 ) {
644  SUBDBG("sys_perf_event_open returned error on event #%d."
645  " Error: %s\n",
646  i, strerror( errno ) );
647  ret = map_perf_event_errors_to_papi( errno );
648 
649  goto open_pe_cleanup;
650  }
651 
652  SUBDBG ("sys_perf_event_open: tid: %ld, cpu_num: %d,"
653  " group_leader/fd: %d, event_fd: %d,"
654  " read_format: 0x%"PRIx64"\n",
655  pid, ctl->cpu, ctl->events[i].group_leader_fd,
656  ctl->events[i].event_fd, ctl->events[i].attr.read_format);
657 
658 
659  /* in many situations the kernel will indicate we opened fine */
660  /* yet things will fail later. So we need to double check */
661  /* we actually can use the events we've set up. */
662 
663  /* This is not necessary if we are multiplexing, and in fact */
664  /* we cannot do this properly if multiplexed because */
665  /* PERF_EVENT_IOC_RESET does not reset the time running info */
666  if (!ctl->multiplexed) {
667  ret = check_scheduability( ctx, ctl, i );
668 
669  if ( ret != PAPI_OK ) {
670  /* the last event did open, so we need to bump the counter */
671  /* before doing the cleanup */
672  i++;
673  goto open_pe_cleanup;
674  }
675  }
676  ctl->events[i].event_opened=1;
677  }
678 
679  /* Now that we've successfully opened all of the events, do whatever */
680  /* "tune-up" is needed to attach the mmap'd buffers, signal handlers, */
681  /* and so on. */
682  for ( i = 0; i < ctl->num_events; i++ ) {
683 
684  /* If sampling is enabled, hook up signal handler */
685  if ( ctl->events[i].attr.sample_period ) {
686  ret = tune_up_fd( ctl, i );
687  if ( ret != PAPI_OK ) {
688  /* All of the fds are open, so we need to clean up all of them */
689  i = ctl->num_events;
690  goto open_pe_cleanup;
691  }
692  } else {
693  /* Make sure this is NULL so close_pe_events works right */
694  ctl->events[i].mmap_buf = NULL;
695  }
696  }
697 
698  /* Mark the events as opened only if completely successful */
699  ctx->state |= PERF_EVENTS_OPENED;
700 
701  return PAPI_OK;
702 
703 open_pe_cleanup:
704  /* We encountered an error, close up the fds we successfully opened. */
705  /* We go backward in an attempt to close group leaders last, although */
706 /* that's probably not strictly necessary. */
707  while ( i > 0 ) {
708  i--;
709  if (ctl->events[i].event_fd>=0) {
710  close( ctl->events[i].event_fd );
711  ctl->events[i].event_opened=0;
712  }
713  }
714 
715  return ret;
716 }
717 
718 /* Close all of the opened events */
719 static int
720 close_pe_events( pe_context_t *ctx, pe_control_t *ctl )
721 {
722  int i;
723  int num_closed=0;
724  int events_not_opened=0;
725 
726  /* should this be a more serious error? */
727  if ( ctx->state & PERF_EVENTS_RUNNING ) {
728  SUBDBG("Closing without stopping first\n");
729  }
730 
731  /* Close child events first */
732  for( i=0; i<ctl->num_events; i++ ) {
733 
734  if (ctl->events[i].event_opened) {
735 
736  if (ctl->events[i].group_leader_fd!=-1) {
737  if ( ctl->events[i].mmap_buf ) {
738  if ( munmap ( ctl->events[i].mmap_buf,
739  ctl->events[i].nr_mmap_pages * getpagesize() ) ) {
740  PAPIERROR( "munmap of fd = %d returned error: %s",
741  ctl->events[i].event_fd, strerror( errno ) );
742  return PAPI_ESYS;
743  }
744  }
745 
746  if ( close( ctl->events[i].event_fd ) ) {
747  PAPIERROR( "close of fd = %d returned error: %s",
748  ctl->events[i].event_fd, strerror( errno ) );
749  return PAPI_ESYS;
750  } else {
751  num_closed++;
752  }
753  ctl->events[i].event_opened=0;
754  }
755  }
756  else {
757  events_not_opened++;
758  }
759  }
760 
761  /* Close the group leaders last */
762  for( i=0; i<ctl->num_events; i++ ) {
763 
764  if (ctl->events[i].event_opened) {
765 
766  if (ctl->events[i].group_leader_fd==-1) {
767  if ( ctl->events[i].mmap_buf ) {
768  if ( munmap ( ctl->events[i].mmap_buf,
769  ctl->events[i].nr_mmap_pages * getpagesize() ) ) {
770  PAPIERROR( "munmap of fd = %d returned error: %s",
771  ctl->events[i].event_fd, strerror( errno ) );
772  return PAPI_ESYS;
773  }
774  }
775 
776 
777  if ( close( ctl->events[i].event_fd ) ) {
778  PAPIERROR( "close of fd = %d returned error: %s",
779  ctl->events[i].event_fd, strerror( errno ) );
780  return PAPI_ESYS;
781  } else {
782  num_closed++;
783  }
784  ctl->events[i].event_opened=0;
785  }
786  }
787  }
788 
789 
790  if (ctl->num_events!=num_closed) {
791  if (ctl->num_events!=(num_closed+events_not_opened)) {
792  PAPIERROR("Didn't close all events: "
793  "Closed %d Not Opened: %d Expected %d\n",
794  num_closed,events_not_opened,ctl->num_events);
795  return PAPI_EBUG;
796  }
797  }
798 
799  ctl->num_events=0;
800 
801  ctx->state &= ~PERF_EVENTS_OPENED;
802 
803  return PAPI_OK;
804 }
805 
806 
807 /********************************************************************/
808 /********************************************************************/
809 /* Functions that are exported via the component interface */
810 /********************************************************************/
811 /********************************************************************/
812 
813 
814 /* set the domain. FIXME: perf_events allows per-event control of this. */
815 /* we do not handle that yet. */
816 int
817 _pe_set_domain( hwd_control_state_t *ctl, int domain)
818 {
819 
820  int i;
821  pe_control_t *pe_ctl = ( pe_control_t *) ctl;
822 
823  SUBDBG("old control domain %d, new domain %d\n",
824  pe_ctl->domain,domain);
825 
826  pe_ctl->domain = domain;
827 
828  /* Force the domain on all events */
829  for( i = 0; i < pe_ctl->num_events; i++ ) {
830  pe_ctl->events[i].attr.exclude_user =
831  !( pe_ctl->domain & PAPI_DOM_USER );
832  pe_ctl->events[i].attr.exclude_kernel =
833  !( pe_ctl->domain & PAPI_DOM_KERNEL );
834  pe_ctl->events[i].attr.exclude_hv =
835  !( pe_ctl->domain & PAPI_DOM_SUPERVISOR );
836  }
837  return PAPI_OK;
838 }
839 
840 /* Shutdown a thread */
841 int
842 _pe_shutdown_thread( hwd_context_t *ctx )
843 {
844  pe_context_t *pe_ctx = ( pe_context_t *) ctx;
845 
846  pe_ctx->initialized=0;
847 
848  return PAPI_OK;
849 }
850 
851 
852 /* reset the hardware counters */
853 /* Note: PAPI_reset() does not necessarily call this */
854 /* unless the events are actually running. */
855 int
856 _pe_reset( hwd_context_t *ctx, hwd_control_state_t *ctl )
857 {
858  int i, ret;
859  pe_control_t *pe_ctl = ( pe_control_t *) ctl;
860 
861  ( void ) ctx; /*unused */
862 
863  /* We need to reset all of the events, not just the group leaders */
864  for( i = 0; i < pe_ctl->num_events; i++ ) {
865  ret = ioctl( pe_ctl->events[i].event_fd, PERF_EVENT_IOC_RESET, NULL );
866  if ( ret == -1 ) {
867  PAPIERROR("ioctl(%d, PERF_EVENT_IOC_RESET, NULL) "
868  "returned error, Linux says: %s",
869  pe_ctl->events[i].event_fd, strerror( errno ) );
870  return PAPI_ESYS;
871  }
872  }
873 
874  return PAPI_OK;
875 }
876 
877 
878 /* write (set) the hardware counters */
879 /* Currently we do not support this. */
880 int
881 _pe_write( hwd_context_t *ctx, hwd_control_state_t *ctl,
882  long long *from )
883 {
884  ( void ) ctx; /*unused */
885  ( void ) ctl; /*unused */
886  ( void ) from; /*unused */
887  /*
888  * Counters cannot be written. Do we need to virtualize the
889  * counters so that they can be written, or perhaps modify code so that
890  * they can be written? FIXME ?
891  */
892 
893  return PAPI_ENOSUPP;
894 }
895 
896 /*
897  * perf_event provides a complicated read interface.
898  * the info returned by read() varies depending on whether
899  * you have PERF_FORMAT_GROUP, PERF_FORMAT_TOTAL_TIME_ENABLED,
900  * PERF_FORMAT_TOTAL_TIME_RUNNING, or PERF_FORMAT_ID set
901  *
902  * To simplify things we just always ask for everything. This might
903  * lead to overhead when reading more than we need, but it makes the
904  * read code a lot simpler than the original implementation we had here.
905  *
906  * For more info on the layout see include/linux/perf_event.h
907  *
908  */
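/* Illustrative layouts (from the perf_event ABI) for the cases    */
/* used below: a lone counter read with the time flags set returns */
/*     { u64 value; u64 time_enabled; u64 time_running; }          */
/* while a PERF_FORMAT_GROUP read on the leader returns            */
/*     { u64 nr; u64 value[nr]; }                                  */
/* hence the checks below for 3 and 1+num_events 64-bit values.    */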
909 
910 int
911 _pe_read( hwd_context_t *ctx, hwd_control_state_t *ctl,
912  long long **events, int flags )
913 {
914  ( void ) flags; /*unused */
915  int i, ret = -1;
916  pe_context_t *pe_ctx = ( pe_context_t *) ctx;
917  pe_control_t *pe_ctl = ( pe_control_t *) ctl;
918  long long papi_pe_buffer[READ_BUFFER_SIZE];
919  long long tot_time_running, tot_time_enabled, scale;
920 
921  /* On kernels before 2.6.33 the TOTAL_TIME_ENABLED and TOTAL_TIME_RUNNING */
922  /* fields are always 0 unless the counter is disabled. So if we are on */
923  /* one of these kernels, then we must disable events before reading. */
924 
925  /* Elsewhere though we disable multiplexing on kernels before 2.6.34 */
926  /* so maybe this isn't even necessary. */
927 
928  if (bug_sync_read()) {
929  if ( pe_ctx->state & PERF_EVENTS_RUNNING ) {
930  for ( i = 0; i < pe_ctl->num_events; i++ ) {
931  /* disable only the group leaders */
932  if ( pe_ctl->events[i].group_leader_fd == -1 ) {
933  ret = ioctl( pe_ctl->events[i].event_fd,
934  PERF_EVENT_IOC_DISABLE, NULL );
935  if ( ret == -1 ) {
936  PAPIERROR("ioctl(PERF_EVENT_IOC_DISABLE) "
937  "returned an error: %s", strerror( errno ));
938  return PAPI_ESYS;
939  }
940  }
941  }
942  }
943  }
944 
945 
946  /* Handle case where we are multiplexing */
947  if (pe_ctl->multiplexed) {
948 
949  /* currently we handle multiplexing by having individual events */
950  /* so we read from each in turn. */
951 
952  for ( i = 0; i < pe_ctl->num_events; i++ ) {
953 
954  ret = read( pe_ctl->events[i].event_fd, papi_pe_buffer,
955  sizeof ( papi_pe_buffer ) );
956  if ( ret == -1 ) {
957  PAPIERROR("read returned an error: %s", strerror( errno ));
958  return PAPI_ESYS;
959  }
960 
961  /* We should read 3 64-bit values from the counter */
962  if (ret<(signed)(3*sizeof(long long))) {
963  PAPIERROR("Error! short read!\n");
964  return PAPI_ESYS;
965  }
966 
967  SUBDBG("read: fd: %2d, tid: %ld, cpu: %d, ret: %d\n",
968  pe_ctl->events[i].event_fd,
969  (long)pe_ctl->tid, pe_ctl->cpu, ret);
970  SUBDBG("read: %lld %lld %lld\n",papi_pe_buffer[0],
971  papi_pe_buffer[1],papi_pe_buffer[2]);
972 
973  tot_time_enabled = papi_pe_buffer[1];
974  tot_time_running = papi_pe_buffer[2];
975 
976  SUBDBG("count[%d] = (papi_pe_buffer[%d] %lld * "
977  "tot_time_enabled %lld) / tot_time_running %lld\n",
978  i, 0,papi_pe_buffer[0],
979  tot_time_enabled,tot_time_running);
980 
981  if (tot_time_running == tot_time_enabled) {
982  /* No scaling needed */
983  pe_ctl->counts[i] = papi_pe_buffer[0];
984  } else if (tot_time_running && tot_time_enabled) {
985  /* Scale factor of 100 to avoid overflows when computing */
986  /* enabled/running */
987 
988  scale = (tot_time_enabled * 100LL) / tot_time_running;
989  scale = scale * papi_pe_buffer[0];
990  scale = scale / 100LL;
991  pe_ctl->counts[i] = scale;
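/* Worked example: a raw count of 1000 with time_enabled 200000  */
/* and time_running 100000 gives scale = (200000*100)/100000 =   */
/* 200, so the reported count is 200*1000/100 = 2000,            */
/* extrapolated to the full enabled time.                        */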
992  } else {
993  /* This should not happen, but Phil reports it sometimes does. */
994  SUBDBG("perf_event kernel bug(?) count, enabled, "
995  "running: %lld, %lld, %lld\n",
996  papi_pe_buffer[0],tot_time_enabled,
997  tot_time_running);
998 
999  pe_ctl->counts[i] = papi_pe_buffer[0];
1000  }
1001  }
1002  }
1003 
1004  /* Handle cases where we cannot use FORMAT GROUP */
1005  else if (bug_format_group() || pe_ctl->inherit) {
1006 
1007  /* we must read each counter individually */
1008  for ( i = 0; i < pe_ctl->num_events; i++ ) {
1009 
1010  ret = read( pe_ctl->events[i].event_fd, papi_pe_buffer,
1011  sizeof ( papi_pe_buffer ) );
1012  if ( ret == -1 ) {
1013  PAPIERROR("read returned an error: %s", strerror( errno ));
1014  return PAPI_ESYS;
1015  }
1016 
1017  /* we should read one 64-bit value from each counter */
1018  if (ret!=sizeof(long long)) {
1019  PAPIERROR("Error! short read!\n");
1020  PAPIERROR("read: fd: %2d, tid: %ld, cpu: %d, ret: %d\n",
1021  pe_ctl->events[i].event_fd,
1022  (long)pe_ctl->tid, pe_ctl->cpu, ret);
1023  return PAPI_ESYS;
1024  }
1025 
1026  SUBDBG("read: fd: %2d, tid: %ld, cpu: %d, ret: %d\n",
1027  pe_ctl->events[i].event_fd, (long)pe_ctl->tid,
1028  pe_ctl->cpu, ret);
1029  SUBDBG("read: %lld\n",papi_pe_buffer[0]);
1030 
1031  pe_ctl->counts[i] = papi_pe_buffer[0];
1032  }
1033  }
1034 
1035 
1036  /* Handle cases where we are using FORMAT_GROUP */
1037  /* We assume only one group leader, in position 0 */
1038 
1039  else {
1040  if (pe_ctl->events[0].group_leader_fd!=-1) {
1041  PAPIERROR("Was expecting group leader!\n");
1042  }
1043 
1044  ret = read( pe_ctl->events[0].event_fd, papi_pe_buffer,
1045  sizeof ( papi_pe_buffer ) );
1046 
1047  if ( ret == -1 ) {
1048  PAPIERROR("read returned an error: %s", strerror( errno ));
1049  return PAPI_ESYS;
1050  }
1051 
1052  /* we read 1 64-bit value (number of events) then */
1053  /* num_events more 64-bit values that hold the counts */
1054  if (ret<(signed)((1+pe_ctl->num_events)*sizeof(long long))) {
1055  PAPIERROR("Error! short read!\n");
1056  return PAPI_ESYS;
1057  }
1058 
1059  SUBDBG("read: fd: %2d, tid: %ld, cpu: %d, ret: %d\n",
1060  pe_ctl->events[0].event_fd,
1061  (long)pe_ctl->tid, pe_ctl->cpu, ret);
1062  {
1063  int j;
1064  for(j=0;j<ret/8;j++) {
1065  SUBDBG("read %d: %lld\n",j,papi_pe_buffer[j]);
1066  }
1067  }
1068 
1069  /* Make sure the kernel agrees with how many events we have */
1070  if (papi_pe_buffer[0]!=pe_ctl->num_events) {
1071  PAPIERROR("Error! Wrong number of events!\n");
1072  return PAPI_ESYS;
1073  }
1074 
1075  /* put the count values in their proper location */
1076  for(i=0;i<papi_pe_buffer[0];i++) {
1077  pe_ctl->counts[i] = papi_pe_buffer[1+i];
1078  }
1079  }
1080 
1081 
1082  /* If we disabled the counters due to bug_sync_read(), */
1083  /* then we need to re-enable them now. */
1084  if (bug_sync_read()) {
1085  if ( pe_ctx->state & PERF_EVENTS_RUNNING ) {
1086  for ( i = 0; i < pe_ctl->num_events; i++ ) {
1087  if ( pe_ctl->events[i].group_leader_fd == -1 ) {
1088  /* this should refresh any overflow counters too */
1089  ret = ioctl( pe_ctl->events[i].event_fd,
1090  PERF_EVENT_IOC_ENABLE, NULL );
1091  if ( ret == -1 ) {
1092  /* Should never happen */
1093  PAPIERROR("ioctl(PERF_EVENT_IOC_ENABLE) returned an error: %s",
1094  strerror( errno ));
1095  return PAPI_ESYS;
1096  }
1097  }
1098  }
1099  }
1100  }
1101 
1102  /* point PAPI to the values we read */
1103  *events = pe_ctl->counts;
1104 
1105  return PAPI_OK;
1106 }
1107 
1108 /* Start counting events */
1109 int
1110 _pe_start( hwd_context_t *ctx, hwd_control_state_t *ctl )
1111 {
1112  int ret;
1113  int i;
1114  int did_something = 0;
1115  pe_context_t *pe_ctx = ( pe_context_t *) ctx;
1116  pe_control_t *pe_ctl = ( pe_control_t *) ctl;
1117 
1118  /* Reset the counters first. Is this necessary? */
1119  ret = _pe_reset( pe_ctx, pe_ctl );
1120  if ( ret ) {
1121  return ret;
1122  }
1123 
1124  /* Enable all of the group leaders */
1125  /* All group leaders have a group_leader_fd of -1 */
1126  for( i = 0; i < pe_ctl->num_events; i++ ) {
1127  if (pe_ctl->events[i].group_leader_fd == -1) {
1128  SUBDBG("ioctl(enable): fd: %d\n", pe_ctl->events[i].event_fd);
1129  ret=ioctl( pe_ctl->events[i].event_fd, PERF_EVENT_IOC_ENABLE, NULL) ;
1130 
1131  /* ioctls always return -1 on failure */
1132  if (ret == -1) {
1133  PAPIERROR("ioctl(PERF_EVENT_IOC_ENABLE) failed.\n");
1134  return PAPI_ESYS;
1135  }
1136 
1137  did_something++;
1138  }
1139  }
1140 
1141  if (!did_something) {
1142  PAPIERROR("Did not enable any counters.\n");
1143  return PAPI_EBUG;
1144  }
1145 
1146  pe_ctx->state |= PERF_EVENTS_RUNNING;
1147 
1148  return PAPI_OK;
1149 
1150 }
1151 
1152 /* Stop all of the counters */
1153 int
1154 _pe_stop( hwd_context_t *ctx, hwd_control_state_t *ctl )
1155 {
1156 
1157  int ret;
1158  int i;
1159  pe_context_t *pe_ctx = ( pe_context_t *) ctx;
1160  pe_control_t *pe_ctl = ( pe_control_t *) ctl;
1161 
1162  /* Just disable the group leaders */
1163  for ( i = 0; i < pe_ctl->num_events; i++ ) {
1164  if ( pe_ctl->events[i].group_leader_fd == -1 ) {
1165  ret=ioctl( pe_ctl->events[i].event_fd, PERF_EVENT_IOC_DISABLE, NULL);
1166  if ( ret == -1 ) {
1167  PAPIERROR( "ioctl(%d, PERF_EVENT_IOC_DISABLE, NULL) "
1168  "returned error, Linux says: %s",
1169  pe_ctl->events[i].event_fd, strerror( errno ) );
1170  return PAPI_EBUG;
1171  }
1172  }
1173  }
1174 
1175  pe_ctx->state &= ~PERF_EVENTS_RUNNING;
1176 
1177  return PAPI_OK;
1178 }
1179 
1180 /* This function clears the current contents of the control structure and
1181  updates it with whatever resources are allocated for all the native events
1182  in the native info structure array. */
1183 
1184 int
1185 _pe_update_control_state( hwd_control_state_t *ctl,
1186  NativeInfo_t *native,
1187  int count, hwd_context_t *ctx )
1188 {
1189  int i = 0, ret;
1190  pe_context_t *pe_ctx = ( pe_context_t *) ctx;
1191  pe_control_t *pe_ctl = ( pe_control_t *) ctl;
1192 
1193  /* close all of the existing fds and start over again */
1194  /* In theory we could have finer-grained control and know if */
1195  /* things were changed, but it's easier to tear things down and rebuild. */
1196  close_pe_events( pe_ctx, pe_ctl );
1197 
1198  /* Calling with count==0 should be OK, it's how things are deallocated */
1199  /* when an eventset is destroyed. */
1200  if ( count == 0 ) {
1201  SUBDBG( "Called with count == 0\n" );
1202  return PAPI_OK;
1203  }
1204 
1205  /* set up all the events */
1206  for( i = 0; i < count; i++ ) {
1207  if ( native ) {
1208  /* Have libpfm4 set the config values for the event */
1209  ret=_pe_libpfm4_setup_counters(&pe_ctl->events[i].attr,
1210  native[i].ni_event,
1211  pe_ctx->event_table);
1212  SUBDBG( "pe_ctl->events[%d].config=%"PRIx64"\n",i,
1213  pe_ctl->events[i].attr.config);
1214  if (ret!=PAPI_OK) return ret;
1215 
1216  } else {
1217  /* I'm not sure how we'd end up in this case */
1218  /* should it be an error? */
1219  }
1220 
1221  /* Copy the inherit flag into the attribute block that will be */
1222  /* passed to the kernel */
1223  pe_ctl->events[i].attr.inherit = pe_ctl->inherit;
1224 
1225  /* Set the position in the native structure */
1226  /* We just set up events linearly */
1227  if ( native ) {
1228  native[i].ni_position = i;
1229  }
1230  }
1231 
1232  pe_ctl->num_events = count;
1233  _pe_set_domain( ctl, pe_ctl->domain );
1234 
1235  /* actually open the events */
1236  /* (why is this a separate function?) */
1237  ret = open_pe_events( pe_ctx, pe_ctl );
1238  if ( ret != PAPI_OK ) {
1239  SUBDBG("open_pe_events failed\n");
1240  /* Restore values ? */
1241  return ret;
1242  }
1243 
1244  return PAPI_OK;
1245 }
1246 
1247 /* Set various options on a control state */
1248 int
1249 _pe_ctl( hwd_context_t *ctx, int code, _papi_int_option_t *option )
1250 {
1251  int ret;
1252  pe_context_t *pe_ctx = ( pe_context_t *) ctx;
1253  pe_control_t *pe_ctl = NULL;
1254 
1255  switch ( code ) {
1256  case PAPI_MULTIPLEX:
1257  pe_ctl = ( pe_control_t * ) ( option->multiplex.ESI->ctl_state );
1258  ret = check_permissions( pe_ctl->tid, pe_ctl->cpu, pe_ctl->domain,
1259  pe_ctl->granularity,
1260  1, pe_ctl->inherit );
1261  if (ret != PAPI_OK) {
1262  return ret;
1263  }
1264 
1265  /* looks like we are allowed, so set multiplexed attribute */
1266  pe_ctl->multiplexed = 1;
1267  ret = _pe_update_control_state( pe_ctl, NULL,
1268  pe_ctl->num_events, pe_ctx );
1269  if (ret != PAPI_OK) {
1270  pe_ctl->multiplexed = 0;
1271  }
1272  return ret;
1273 
1274  case PAPI_ATTACH:
1275  pe_ctl = ( pe_control_t * ) ( option->attach.ESI->ctl_state );
1276  ret = check_permissions( option->attach.tid, pe_ctl->cpu,
1277  pe_ctl->domain, pe_ctl->granularity,
1278  pe_ctl->multiplexed,
1279  pe_ctl->inherit );
1280  if (ret != PAPI_OK) {
1281  return ret;
1282  }
1283 
1284  pe_ctl->tid = option->attach.tid;
1285 
1286  /* If events have already been added, something may */
1287  /* have been done to the kernel, so update */
1288  ret =_pe_update_control_state( pe_ctl, NULL,
1289  pe_ctl->num_events, pe_ctx);
1290 
1291  return ret;
1292 
1293  case PAPI_DETACH:
1294  pe_ctl = ( pe_control_t *) ( option->attach.ESI->ctl_state );
1295 
1296  pe_ctl->tid = 0;
1297  return PAPI_OK;
1298 
1299  case PAPI_CPU_ATTACH:
1300  pe_ctl = ( pe_control_t *) ( option->cpu.ESI->ctl_state );
1301  ret = check_permissions( pe_ctl->tid, option->cpu.cpu_num,
1302  pe_ctl->domain, pe_ctl->granularity,
1303  pe_ctl->multiplexed,
1304  pe_ctl->inherit );
1305  if (ret != PAPI_OK) {
1306  return ret;
1307  }
1308  /* looks like we are allowed so set cpu number */
1309 
1310  /* this tells the kernel not to count for a thread */
1311  /* should we warn if we try to set both? perf_event */
1312  /* will reject it. */
1313  pe_ctl->tid = -1;
1314 
1315  pe_ctl->cpu = option->cpu.cpu_num;
1316 
1317  return PAPI_OK;
1318 
1319  case PAPI_DOMAIN:
1320  pe_ctl = ( pe_control_t *) ( option->domain.ESI->ctl_state );
1321  ret = check_permissions( pe_ctl->tid, pe_ctl->cpu,
1322  option->domain.domain,
1323  pe_ctl->granularity,
1324  pe_ctl->multiplexed,
1325  pe_ctl->inherit );
1326  if (ret != PAPI_OK) {
1327  return ret;
1328  }
1329  /* looks like we are allowed, so set counting domain */
1330  return _pe_set_domain( pe_ctl, option->domain.domain );
1331 
1332  case PAPI_GRANUL:
1333  pe_ctl = (pe_control_t *) ( option->granularity.ESI->ctl_state );
1334 
1335  /* FIXME: we really don't support this yet */
1336 
1337  switch ( option->granularity.granularity ) {
1338  case PAPI_GRN_PROCG:
1339  case PAPI_GRN_SYS_CPU:
1340  case PAPI_GRN_PROC:
1341  return PAPI_ECMP;
1342 
1343  /* Currently we only support thread and CPU granularity */
1344  case PAPI_GRN_SYS:
1345  pe_ctl->granularity=PAPI_GRN_SYS;
1346  break;
1347 
1348  case PAPI_GRN_THR:
1349  pe_ctl->granularity=PAPI_GRN_THR;
1350  break;
1351 
1352 
1353  default:
1354  return PAPI_EINVAL;
1355  }
1356  return PAPI_OK;
1357 
1358  case PAPI_INHERIT:
1359  pe_ctl = (pe_control_t *) ( option->inherit.ESI->ctl_state );
1360  ret = check_permissions( pe_ctl->tid, pe_ctl->cpu, pe_ctl->domain,
1361  pe_ctl->granularity, pe_ctl->multiplexed,
1362  option->inherit.inherit );
1363  if (ret != PAPI_OK) {
1364  return ret;
1365  }
1366  /* looks like we are allowed, so set the requested inheritance */
1367  if (option->inherit.inherit) {
1368  /* children will inherit counters */
1369  pe_ctl->inherit = 1;
1370  } else {
1371  /* children won't inherit counters */
1372  pe_ctl->inherit = 0;
1373  }
1374  return PAPI_OK;
1375 
1376  case PAPI_DATA_ADDRESS:
1377  return PAPI_ENOSUPP;
1378 #if 0
1379  pe_ctl = (pe_control_t *) (option->address_range.ESI->ctl_state);
1380  ret = set_default_domain( pe_ctl, option->address_range.domain );
1381  if ( ret != PAPI_OK ) {
1382  return ret;
1383  }
1384  set_drange( pe_ctx, pe_ctl, option );
1385  return PAPI_OK;
1386 #endif
1387  case PAPI_INSTR_ADDRESS:
1388  return PAPI_ENOSUPP;
1389 #if 0
1390  pe_ctl = (pe_control_t *) (option->address_range.ESI->ctl_state);
1391  ret = set_default_domain( pe_ctl, option->address_range.domain );
1392  if ( ret != PAPI_OK ) {
1393  return ret;
1394  }
1395  set_irange( pe_ctx, pe_ctl, option );
1396  return PAPI_OK;
1397 #endif
1398 
1399  case PAPI_DEF_ITIMER:
1400  /* What should we be checking for here? */
1401  /* This seems like it should be OS-specific not component */
1402  /* specific. */
1403 
1404  return PAPI_OK;
1405 
1406  case PAPI_DEF_MPX_NS:
1407  /* Defining a given ns per set is not currently supported */
1408  return PAPI_ENOSUPP;
1409 
1410  case PAPI_DEF_ITIMER_NS:
1411  /* We don't support this... */
1412  return PAPI_OK;
1413 
1414  default:
1415  return PAPI_ENOSUPP;
1416  }
1417 }
1418 
1419 /* Initialize a thread */
1420 int
1421 _pe_init_thread( hwd_context_t *hwd_ctx )
1422 {
1423 
1424  pe_context_t *pe_ctx = ( pe_context_t *) hwd_ctx;
1425 
1426  /* clear the context structure and mark as initialized */
1427  memset( pe_ctx, 0, sizeof ( pe_context_t ) );
1428  pe_ctx->initialized=1;
1429  pe_ctx->event_table=&perf_native_event_table;
1430  pe_ctx->cidx=our_cidx;
1431 
1432  return PAPI_OK;
1433 }
1434 
1435 /* Initialize a new control state */
1436 int
1437 _pe_init_control_state( hwd_control_state_t *ctl )
1438 {
1439  pe_control_t *pe_ctl = ( pe_control_t *) ctl;
1440 
1441  /* clear the contents */
1442  memset( pe_ctl, 0, sizeof ( pe_control_t ) );
1443 
1444  /* Set the domain */
1445  _pe_set_domain( ctl, _perf_event_vector.cmp_info.default_domain );
1446 
1447  /* default granularity */
1448  pe_ctl->granularity= _perf_event_vector.cmp_info.default_granularity;
1449 
1450  /* overflow signal */
1451  pe_ctl->overflow_signal=_perf_event_vector.cmp_info.hardware_intr_sig;
1452 
1453  pe_ctl->cidx=our_cidx;
1454 
1455  /* Set cpu number in the control block to show events */
1456  /* are not tied to specific cpu */
1457  pe_ctl->cpu = -1;
1458  return PAPI_OK;
1459 }
1460 
1461 /* Check the mmap page for rdpmc support */
1462 static int _pe_detect_rdpmc(int default_domain) {
1463 
1464  struct perf_event_attr pe;
1465  int fd,rdpmc_exists=1;
1466  void *addr;
1467  struct perf_event_mmap_page *our_mmap;
1468 
1469  /* Create a fake instructions event so we can read a mmap page */
1470  memset(&pe,0,sizeof(struct perf_event_attr));
1471 
1472  pe.type=PERF_TYPE_HARDWARE;
1473  pe.size=sizeof(struct perf_event_attr);
1474  pe.config=PERF_COUNT_HW_INSTRUCTIONS;
1475 
1476  /* There should probably be a helper function to handle this */
1477  /* we break on some ARM because there is no support for excluding */
1478  /* kernel. */
1479  if (default_domain & PAPI_DOM_KERNEL ) {
1480  }
1481  else {
1482  pe.exclude_kernel=1;
1483  }
1484  fd=sys_perf_event_open(&pe,0,-1,-1,0);
1485  if (fd<0) {
1486  return PAPI_ESYS;
1487  }
1488 
1489  /* create the mmap page */
1490  addr=mmap(NULL, 4096, PROT_READ, MAP_SHARED,fd,0);
1491  if (addr == (void *)(-1)) {
1492  close(fd);
1493  return PAPI_ESYS;
1494  }
1495 
1496  /* get the rdpmc info */
1497  our_mmap=(struct perf_event_mmap_page *)addr;
1498  if (our_mmap->cap_usr_rdpmc==0) {
1499  rdpmc_exists=0;
1500  }
1501 
1502  /* close the fake event */
1503  munmap(addr,4096);
1504  close(fd);
1505 
1506  return rdpmc_exists;
1507 
1508 }
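/* Usage note (an assumption based on the perf_event ABI, not   */
/* PAPI code): cap_usr_rdpmc is only set by kernels (3.4 and    */
/* later) that permit self-monitoring reads via the rdpmc       */
/* instruction, so older kernels simply report it unavailable.  */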
1509 
1510 
1511 /* Initialize the perf_event component */
1512 int
1513 _pe_init_component( int cidx )
1514 {
1515 
1516  int retval;
1517  int paranoid_level;
1518 
1519  FILE *fff;
1520 
1521  our_cidx=cidx;
1522 
1523  /* This is the official way to detect if perf_event support exists */
1524  /* The file is called perf_counter_paranoid on 2.6.31 */
1525  /* currently we are lazy and do not support 2.6.31 kernels */
1526  fff=fopen("/proc/sys/kernel/perf_event_paranoid","r");
1527  if (fff==NULL) {
1528  strncpy(_papi_hwd[cidx]->cmp_info.disabled_reason,
1529  "perf_event support not detected",PAPI_MAX_STR_LEN);
1530  return PAPI_ENOCMP;
1531  }
1532 
1533  /* 2 means no kernel measurements allowed */
1534  /* 1 means normal counter access */
1535  /* 0 means you can access CPU-specific data */
1536  /* -1 means no restrictions */
1537  retval=fscanf(fff,"%d",&paranoid_level);
1538  if (retval!=1) fprintf(stderr,"Error reading paranoid level\n");
1539  fclose(fff);
1540 
1541  if ((paranoid_level==2) && (getuid()!=0)) {
1542  SUBDBG("/proc/sys/kernel/perf_event_paranoid prohibits kernel counts");
1543  _papi_hwd[cidx]->cmp_info.available_domains &=~PAPI_DOM_KERNEL;
1544  }
1545 
1546  /* Detect NMI watchdog which can steal counters */
1547  nmi_watchdog_active=_linux_detect_nmi_watchdog();
1548  if (nmi_watchdog_active) {
1549  SUBDBG("The Linux nmi_watchdog is using one of the performance "
1550  "counters, reducing the total number available.\n");
1551  }
1552  /* Kernel multiplexing is broken prior to kernel 2.6.34 */
1553  /* The fix was probably git commit: */
1554  /* 45e16a6834b6af098702e5ea6c9a40de42ff77d8 */
1555  if (_papi_os_info.os_version < LINUX_VERSION(2,6,34)) {
1556  _papi_hwd[cidx]->cmp_info.kernel_multiplex = 0;
1557  _papi_hwd[cidx]->cmp_info.num_mpx_cntrs = PAPI_MAX_SW_MPX_EVENTS;
1558  }
1559  else {
1560  _papi_hwd[cidx]->cmp_info.kernel_multiplex = 1;
1561  _papi_hwd[cidx]->cmp_info.num_mpx_cntrs = PERF_EVENT_MAX_MPX_COUNTERS;
1562  }
1563 
1564  /* Check that processor is supported */
1565  if (processor_supported(_papi_hwi_system_info.hw_info.vendor,
1566  _papi_hwi_system_info.hw_info.cpuid_family)!=
1567  PAPI_OK) {
1568  fprintf(stderr,"warning, your processor is unsupported\n");
1569  /* should not return error, as software events should still work */
1570  }
1571 
1572  /* Setup mmtimers, if appropriate */
1573  retval=mmtimer_setup();
1574  if (retval) {
1575  strncpy(_papi_hwd[cidx]->cmp_info.disabled_reason,
1576  "Error initializing mmtimer",PAPI_MAX_STR_LEN);
1577  return retval;
1578  }
1579 
1580  /* Set the overflow signal */
1581  _papi_hwd[cidx]->cmp_info.hardware_intr_sig = SIGRTMIN + 2;
1582 
1583  /* Run Vendor-specific fixups */
1584  pe_vendor_fixups(_papi_hwd[cidx]);
1585 
1586  /* Detect if we can use rdpmc (or equivalent) */
1587  /* We currently do not use rdpmc as it is slower in tests */
1588  /* than regular read (as of Linux 3.5) */
1589  retval=_pe_detect_rdpmc(_papi_hwd[cidx]->cmp_info.default_domain);
1590  if (retval < 0 ) {
1591  strncpy(_papi_hwd[cidx]->cmp_info.disabled_reason,
1592  "sys_perf_event_open() failed, perf_event support for this platform may be broken",PAPI_MAX_STR_LEN);
1593 
1594  return retval;
1595  }
1596  _papi_hwd[cidx]->cmp_info.fast_counter_read = retval;
1597 
1598  /* Run the libpfm4-specific setup */
1599  retval = _papi_libpfm4_init(_papi_hwd[cidx]);
1600  if (retval) {
1601  strncpy(_papi_hwd[cidx]->cmp_info.disabled_reason,
1602  "Error initializing libpfm4",PAPI_MAX_STR_LEN);
1603  return retval;
1604  }
1605 
1606  retval = _pe_libpfm4_init(_papi_hwd[cidx], cidx,
1607  &perf_native_event_table,
1608  PMU_TYPE_CORE);
1609  if (retval) {
1610  strncpy(_papi_hwd[cidx]->cmp_info.disabled_reason,
1611  "Error initializing libpfm4",PAPI_MAX_STR_LEN);
1612  return retval;
1613  }
1614 
1615  return PAPI_OK;
1616 
1617 }
1618 
1619 /* Shutdown the perf_event component */
1620 int
1621 _pe_shutdown_component( void ) {
1622 
1623  /* deallocate our event table */
1624  _pe_libpfm4_shutdown(&perf_native_event_table);
1625 
1626  /* Shutdown libpfm4 */
1627  _papi_libpfm4_shutdown();
1628 
1629  return PAPI_OK;
1630 }
1631 
1632 
1633 
1634 
1635 int
1636 _pe_ntv_enum_events( unsigned int *PapiEventCode, int modifier )
1637 {
1638  return _pe_libpfm4_ntv_enum_events(PapiEventCode, modifier,
1639  &perf_native_event_table);
1640 }
1641 
1642 int
1643 _pe_ntv_name_to_code( char *name, unsigned int *event_code) {
1644  return _pe_libpfm4_ntv_name_to_code(name,event_code,
1645  &perf_native_event_table);
1646 }
1647 
1648 int
1649 _pe_ntv_code_to_name(unsigned int EventCode,
1650  char *ntv_name, int len) {
1651  return _pe_libpfm4_ntv_code_to_name(EventCode,
1652  ntv_name, len,
1653  &perf_native_event_table);
1654 }
1655 
1656 int
1657 _pe_ntv_code_to_descr( unsigned int EventCode,
1658  char *ntv_descr, int len) {
1659 
1660  return _pe_libpfm4_ntv_code_to_descr(EventCode,ntv_descr,len,
1661  &perf_native_event_table);
1662 }
1663 
1664 int
1665 _pe_ntv_code_to_info(unsigned int EventCode,
1666  PAPI_event_info_t *info) {
1667 
1668  return _pe_libpfm4_ntv_code_to_info(EventCode, info,
1669  &perf_native_event_table);
1670 }
1671 
1672 /* These functions are based on builtin-record.c in the */
1673 /* kernel's tools/perf directory. */
1674 
1675 static uint64_t
1676 mmap_read_head( pe_event_info_t *pe )
1677 {
1678  struct perf_event_mmap_page *pc = pe->mmap_buf;
1679  int head;
1680 
1681  if ( pc == NULL ) {
1682  PAPIERROR( "perf_event_mmap_page is NULL" );
1683  return 0;
1684  }
1685 
1686  head = pc->data_head;
1687  rmb( );
1688 
1689  return head;
1690 }
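/* The rmb() above pairs with the kernel's write barrier: we must */
/* read data_head before touching any ring-buffer contents it     */
/* describes, or we could see stale sample data.                  */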
1691 
1692 static void
1693 mmap_write_tail( pe_event_info_t *pe, uint64_t tail )
1694 {
1695  struct perf_event_mmap_page *pc = pe->mmap_buf;
1696 
1697  /* ensure all reads are done before we write the tail out. */
1698  pc->data_tail = tail;
1699 }
1700 
1701 
1702 /* Does the kernel define these somewhere? */
1703 struct ip_event {
1704  struct perf_event_header header;
1705  uint64_t ip;
1706 };
1707 struct lost_event {
1708  struct perf_event_header header;
1709  uint64_t id;
1710  uint64_t lost;
1711 };
1712 typedef union event_union {
1713  struct perf_event_header header;
1714  struct ip_event ip;
1715  struct lost_event lost;
1716 } perf_sample_event_t;
1717 
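/* These mirror the PERF_RECORD_SAMPLE (with only PERF_SAMPLE_IP */
/* set) and PERF_RECORD_LOST record layouts from the kernel's    */
/* include/linux/perf_event.h.                                   */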
1718 /* Should re-write with comments if we ever figure out what's */
1719 /* going on here. */
1720 static void
1721 mmap_read( int cidx, ThreadInfo_t **thr, pe_event_info_t *pe,
1722  int profile_index )
1723 {
1724  uint64_t head = mmap_read_head( pe );
1725  uint64_t old = pe->tail;
1726  unsigned char *data = ((unsigned char*)pe->mmap_buf) + getpagesize( );
1727  int diff;
1728 
1729  diff = head - old;
1730  if ( diff < 0 ) {
1731  SUBDBG( "WARNING: failed to keep up with mmap data. head = %" PRIu64
1732  ", tail = %" PRIu64 ". Discarding samples.\n", head, old );
1733  /* head points to a known good entry, start there. */
1734  old = head;
1735  }
1736 
1737  for( ; old != head; ) {
1738  perf_sample_event_t *event = ( perf_sample_event_t * )
1739  & data[old & pe->mask];
1740  perf_sample_event_t event_copy;
1741  size_t size = event->header.size;
1742 
1743  /* Event straddles the mmap boundary -- header should always */
1744  /* be inside due to u64 alignment of output. */
1745  if ( ( old & pe->mask ) + size != ( ( old + size ) & pe->mask ) ) {
1746  uint64_t offset = old;
1747  uint64_t len = min( sizeof ( *event ), size ), cpy;
1748  void *dst = &event_copy;
1749 
1750  do {
1751  cpy = min( pe->mask + 1 - ( offset & pe->mask ), len );
1752  memcpy( dst, &data[offset & pe->mask], cpy );
1753  offset += cpy;
1754  dst = ((unsigned char*)dst) + cpy;
1755  len -= cpy;
1756  } while ( len );
1757 
1758  event = &event_copy;
1759  }
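/* Worked example (illustrative): with mask = 0x7fff, a 24-byte  */
/* event starting at offset 0x7ff0 straddles the wrap point; the */
/* loop above copies 16 bytes from the end of the buffer and the */
/* remaining 8 from its start into event_copy.                   */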
1760  old += size;
1761 
1762  SUBDBG( "event->type = %08x\n", event->header.type );
1763  SUBDBG( "event->size = %d\n", event->header.size );
1764 
1765  switch ( event->header.type ) {
1766  case PERF_RECORD_SAMPLE:
1767  _papi_hwi_dispatch_profile( ( *thr )->running_eventset[cidx],
1768  ( caddr_t ) ( unsigned long ) event->ip.ip,
1769  0, profile_index );
1770  break;
1771 
1772  case PERF_RECORD_LOST:
1773  SUBDBG( "Warning: because of a mmap buffer overrun, %" PRId64
1774  " events were lost.\n"
1775  "Loss was recorded when counter id 0x%"PRIx64
1776  " overflowed.\n", event->lost.lost, event->lost.id );
1777  break;
1778 
1779  default:
1780  SUBDBG( "Error: unexpected header type - %d\n",
1781  event->header.type );
1782  break;
1783  }
1784  }
1785 
1786  pe->tail = old;
1787  mmap_write_tail( pe, old );
1788 }
1789 
1790 /* Find a native event specified by a profile index */
1791 static int
1792 find_profile_index( EventSetInfo_t *ESI, int evt_idx, int *flags,
1793  unsigned int *native_index, int *profile_index )
1794 {
1795  int pos, esi_index, count;
1796 
1797  for ( count = 0; count < ESI->profile.event_counter; count++ ) {
1798  esi_index = ESI->profile.EventIndex[count];
1799  pos = ESI->EventInfoArray[esi_index].pos[0];
1800 
1801  if ( pos == evt_idx ) {
1802  *profile_index = count;
1803  *native_index = ESI->NativeInfoArray[pos].ni_event &
1804  PAPI_NATIVE_AND_MASK;
1805  *flags = ESI->profile.flags;
1806  SUBDBG( "Native event %d is at profile index %d, flags %d\n",
1807  *native_index, *profile_index, *flags );
1808  return PAPI_OK;
1809  }
1810  }
1811  PAPIERROR( "wrong count: %d vs. ESI->profile.event_counter %d", count,
1812  ESI->profile.event_counter );
1813  return PAPI_EBUG;
1814 }
1815 
1816 
1817 
1818 /* What exactly does this do? */
1819 static int
1820 process_smpl_buf( int evt_idx, ThreadInfo_t **thr, int cidx )
1821 {
1822  int ret, flags, profile_index;
1823  unsigned native_index;
1824  pe_control_t *ctl;
1825 
1826  ret = find_profile_index( ( *thr )->running_eventset[cidx], evt_idx,
1827  &flags, &native_index, &profile_index );
1828  if ( ret != PAPI_OK ) {
1829  return ret;
1830  }
1831 
1832  ctl= (*thr)->running_eventset[cidx]->ctl_state;
1833 
1834  mmap_read( cidx, thr,
1835  &(ctl->events[evt_idx]),
1836  profile_index );
1837 
1838  return PAPI_OK;
1839 }
1840 
1841 /*
1842  * This function is used when hardware overflows are working or when
1843  * software overflows are forced
1844  */
1845 
1846 void
1847 _pe_dispatch_timer( int n, hwd_siginfo_t *info, void *uc)
1848 {
1849  ( void ) n; /*unused */
1850  _papi_hwi_context_t hw_context;
1851  int found_evt_idx = -1, fd = info->si_fd;
1852  caddr_t address;
1853  ThreadInfo_t *thread = _papi_hwi_lookup_thread( 0 );
1854  int i;
1855  pe_control_t *ctl;
1856  int cidx = _perf_event_vector.cmp_info.CmpIdx;
1857 
1858  if ( thread == NULL ) {
1859  PAPIERROR( "thread == NULL in _papi_pe_dispatch_timer for fd %d!", fd );
1860  return;
1861  }
1862 
1863  if ( thread->running_eventset[cidx] == NULL ) {
1864  PAPIERROR( "thread->running_eventset == NULL in "
1865  "_papi_pe_dispatch_timer for fd %d!",fd );
1866  return;
1867  }
1868 
1869  if ( thread->running_eventset[cidx]->overflow.flags == 0 ) {
1870  PAPIERROR( "thread->running_eventset->overflow.flags == 0 in "
1871  "_papi_pe_dispatch_timer for fd %d!", fd );
1872  return;
1873  }
1874 
1875  hw_context.si = info;
1876  hw_context.ucontext = ( hwd_ucontext_t * ) uc;
1877 
1878  if ( thread->running_eventset[cidx]->overflow.flags &
1879  PAPI_OVERFLOW_FORCE_SW ) {
1880  address = GET_OVERFLOW_ADDRESS( hw_context );
1881  _papi_hwi_dispatch_overflow_signal( ( void * ) &hw_context,
1882  address, NULL, 0,
1883  0, &thread, cidx );
1884  return;
1885  }
1886 
1887  if ( thread->running_eventset[cidx]->overflow.flags !=
1888  PAPI_OVERFLOW_HARDWARE ) {
1889  PAPIERROR( "thread->running_eventset->overflow.flags is set to "
1890  "something other than PAPI_OVERFLOW_HARDWARE or "
1891  "PAPI_OVERFLOW_FORCE_SW for fd %d (%#x)",
1892  fd , thread->running_eventset[cidx]->overflow.flags);
1893  }
1894 
1895  /* convoluted way to get ctl */
1896  ctl= thread->running_eventset[cidx]->ctl_state;
1897 
1898  /* See if the fd is one that's part of this thread's context */
1899  for( i=0; i < ctl->num_events; i++ ) {
1900  if ( fd == ctl->events[i].event_fd ) {
1901  found_evt_idx = i;
1902  break;
1903  }
1904  }
1905 
1906  if ( found_evt_idx == -1 ) {
1907  PAPIERROR( "Unable to find fd %d among the open event fds "
1908  "_papi_hwi_dispatch_timer!", fd );
1909  return;
1910  }
1911 
1912  if (ioctl( fd, PERF_EVENT_IOC_DISABLE, NULL ) == -1 ) {
1913  PAPIERROR("ioctl(PERF_EVENT_IOC_DISABLE) failed.\n");
1914  }
1915 
1916  if ( ( thread->running_eventset[cidx]->state & PAPI_PROFILING ) &&
1917  !( thread->running_eventset[cidx]->profile.flags &
1918  PAPI_PROFIL_FORCE_SW ) ) {
1919  process_smpl_buf( found_evt_idx, &thread, cidx );
1920  }
1921  else {
1922  uint64_t ip;
1923  unsigned int head;
1924  pe_event_info_t *pe = &(ctl->events[found_evt_idx]);
1925  unsigned char *data = ((unsigned char*)pe->mmap_buf) + getpagesize( );
1926 
1927  /*
1928  * Read up the most recent IP from the sample in the mmap buffer. To
1929  * do this, we make the assumption that all of the records in the
1930  * mmap buffer are the same size, and that they all contain the IP as
1931  * their only record element. This means that we can use the
1932  * data_head element from the user page and move backward one record
1933  * from that point and read the data. Since we don't actually need
1934  * to access the header of the record, we can just subtract 8 (size
1935  * of the IP) from data_head and read that word from the mmap
1936  * buffer. After we subtract 8, we account for mmap buffer wrapping
1937  * by AND'ing this offset with the buffer mask.
1938  */
1939  head = mmap_read_head( pe );
1940 
1941  if ( head == 0 ) {
1942  PAPIERROR( "Attempting to access memory which may be inaccessible" );
1943  return;
1944  }
1945  ip = *( uint64_t * ) ( data + ( ( head - 8 ) & pe->mask ) );
1946  /*
1947  * Update the tail to the current head pointer.
1948  *
1949  * Note that if we were to read the record at the tail pointer,
1950  * rather than the one at the head (as you might otherwise think
1951  * would be natural), we could run into problems. Signals don't
1952  * stack well on Linux, particularly if not using RT signals, and if
1953  * they come in rapidly enough, we can lose some. Over time, the head
1954  * could catch up to the tail and monitoring would be stopped, and
1955  * since no more signals are coming in, this problem will never be
1956  * resolved, resulting in a complete loss of overflow notification
1957  * from that point on. So the solution we use here will result in
1958  * only the most recent IP value being read every time there are two
1959  * or more samples in the buffer (for that one overflow signal). But
1960  * the handler will always bring up the tail, so the head should
1961  * never run into the tail.
1962  */
1963  mmap_write_tail( pe, head );
1964 
1965  /*
1966  * The fourth parameter is supposed to be a vector of bits indicating
1967  * the overflowed hardware counters, but it's not really clear that
1968  * it's useful, because the actual hardware counters used are not
1969  * exposed to the PAPI user. For now, I'm just going to set the bit
1970  * that indicates which event register in the array overflowed. The
1971  * result is that the overflow vector will not be identical to the
1972  * perfmon implementation, and part of that is due to the fact that
1973  * which hardware register is actually being used is opaque at the
1974  * user level (the kernel event dispatcher hides that info).
1975  */
1976 
1977  _papi_hwi_dispatch_overflow_signal( ( void * ) &hw_context,
1978  ( caddr_t ) ( unsigned long ) ip,
1979  NULL, ( 1 << found_evt_idx ), 0,
1980  &thread, cidx );
1981 
1982  }
1983 
1984  /* Restart the counters */
1985  if (ioctl( fd, PERF_EVENT_IOC_REFRESH, PAPI_REFRESH_VALUE ) == -1) {
1986  PAPIERROR( "overflow refresh failed" );
1987  }
1988 }
1989 
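/* For illustration, a minimal self-contained sketch of the "newest IP"
 * read performed above. The helper name read_latest_ip() is hypothetical;
 * rmb() is the read barrier this file already gets from "mb.h". It
 * assumes, as the code above does, that every record in the buffer is a
 * fixed-size PERF_SAMPLE_IP-only sample. */

#include <stdint.h>
#include <unistd.h>
#include <linux/perf_event.h>
#include "mb.h"

static uint64_t
read_latest_ip( void *mmap_buf, uint64_t mask )
{
        struct perf_event_mmap_page *pc = mmap_buf;
        /* Sample data begins one page past the control page */
        unsigned char *data = ( unsigned char * ) mmap_buf + getpagesize( );
        uint64_t head = pc->data_head;
        uint64_t ip;

        rmb( );  /* pairs with the kernel's barrier on data_head updates */

        /* Step back over the 8-byte IP payload; the mask handles wrap */
        ip = *( uint64_t * ) ( data + ( ( head - 8 ) & mask ) );

        /* Consume everything up to head so the buffer never fills */
        pc->data_tail = head;

        return ip;
}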
1990 /* Stop profiling */
1991 int
1992 _pe_stop_profiling( ThreadInfo_t *thread, EventSetInfo_t *ESI )
1993 {
1994  int i, ret = PAPI_OK;
1995  pe_control_t *ctl;
1996  int cidx;
1997 
1998  ctl=ESI->ctl_state;
1999 
2000  cidx=ctl->cidx;
2001 
2002  /* Loop through all of the events and process those which have mmap */
2003  /* buffers attached. */
2004  for ( i = 0; i < ctl->num_events; i++ ) {
2005  /* Use the mmap_buf field as an indicator of this fd being used for */
2006  /* profiling. */
2007  if ( ctl->events[i].mmap_buf ) {
2008  /* Process any remaining samples in the sample buffer */
2009  ret = process_smpl_buf( i, &thread, cidx );
2010  if ( ret ) {
2011  PAPIERROR( "process_smpl_buf returned error %d", ret );
2012  return ret;
2013  }
2014  }
2015  }
2016  return ret;
2017 }
2018 
2019 /* Setup an event to cause overflow */
2020 int
2021 _pe_set_overflow( EventSetInfo_t *ESI, int EventIndex, int threshold )
2022 {
2023 
2024  pe_context_t *ctx;
2025  pe_control_t *ctl = (pe_control_t *) ( ESI->ctl_state );
2026  int i, evt_idx, found_non_zero_sample_period = 0, retval = PAPI_OK;
2027  int cidx;
2028 
2029  cidx = ctl->cidx;
2030  ctx = ( pe_context_t *) ( ESI->master->context[cidx] );
2031 
2032  evt_idx = ESI->EventInfoArray[EventIndex].pos[0];
2033 
2034  SUBDBG("Attempting to set overflow for index %d (%d) of EventSet %d\n",
2035  evt_idx,EventIndex,ESI->EventSetIndex);
2036 
2037  if (evt_idx<0) {
2038  return PAPI_EINVAL;
2039  }
2040 
2041  if ( threshold == 0 ) {
2042  /* If this counter isn't set to overflow, it's an error */
2043  if ( ctl->events[evt_idx].attr.sample_period == 0 ) return PAPI_EINVAL;
2044  }
2045 
2046  ctl->events[evt_idx].attr.sample_period = threshold;
2047 
2048  /*
2049  * Note that the wakeup_mode field will initially be zero
2050  * (WAKEUP_MODE_COUNTER_OVERFLOW) because the events in the ctl
2051  * struct are zeroed with memset() when they are initialized.
2052  *
2053  * Is it even set to any other value elsewhere?
2054  */
2055  switch ( ctl->events[evt_idx].wakeup_mode ) {
2056  case WAKEUP_MODE_PROFILING:
2057  /* Setting wakeup_events to special value zero means issue a */
2058  /* wakeup (signal) on every mmap page overflow. */
2059  ctl->events[evt_idx].attr.wakeup_events = 0;
2060  break;
2061 
2062  case WAKEUP_MODE_COUNTER_OVERFLOW:
2063  /* Can this code ever be called? */
2064 
2065  /* Setting wakeup_events to one means issue a wakeup on every */
2066  /* counter overflow (not mmap page overflow). */
2067  ctl->events[evt_idx].attr.wakeup_events = 1;
2068  /* We need the IP to pass to the overflow handler */
2069  ctl->events[evt_idx].attr.sample_type = PERF_SAMPLE_IP;
2070  /* one for the user page, and two to take IP samples */
2071  ctl->events[evt_idx].nr_mmap_pages = 1 + 2;
2072  break;
2073  default:
2074  PAPIERROR( "ctl->events[%d].wakeup_mode set to an unknown value - %u",
2075  evt_idx, ctl->events[evt_idx].wakeup_mode);
2076  return PAPI_EBUG;
2077  }
2078 
2079  /* Check for non-zero sample period */
2080  for ( i = 0; i < ctl->num_events; i++ ) {
2081  if ( ctl->events[i].attr.sample_period ) {
2082  found_non_zero_sample_period = 1;
2083  break;
2084  }
2085  }
2086 
2087  if ( found_non_zero_sample_period ) {
2088  /* turn on internal overflow flag for this event set */
2089  ctl->overflow = 1;
2090 
2091  /* Enable the signal handler */
2092  retval = _papi_hwi_start_signal(
2093  ctl->overflow_signal,
2094  1, ctl->cidx );
2095  } else {
2096  /* turn off internal overflow flag for this event set */
2097  ctl->overflow = 0;
2098 
2099  /* Remove the signal handler, if there are no remaining non-zero */
2100  /* sample_periods set */
2101  retval = _papi_hwi_stop_signal( ctl->overflow_signal );
2102  if ( retval != PAPI_OK ) return retval;
2103  }
2104 
2105  retval = _pe_update_control_state( ctl, NULL,
2106  ( (pe_control_t *) (ESI->ctl_state) )->num_events,
2107  ctx );
2108 
2109  return retval;
2110 }
2111 
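/* From user code this path is reached through PAPI_overflow(). A minimal
 * sketch, assuming the event set already contains PAPI_TOT_CYC; the
 * helper names and the threshold of one million are arbitrary. Calling
 * PAPI_overflow() again with a threshold of 0 disarms the event, which
 * is the threshold == 0 branch handled above. */

#include "papi.h"

static void
my_overflow_handler( int EventSet, void *address, long long overflow_vector,
                     void *context )
{
        ( void ) EventSet;   ( void ) address;
        ( void ) overflow_vector;   ( void ) context;
        /* Runs in signal context: keep any work async-signal-safe */
}

static int
arm_overflow( int EventSet )
{
        /* Request a notification every 1000000 occurrences of the event */
        return PAPI_overflow( EventSet, PAPI_TOT_CYC, 1000000, 0,
                              my_overflow_handler );
}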
2112 /* Enable profiling */
2113 int
2114 _pe_set_profile( EventSetInfo_t *ESI, int EventIndex, int threshold )
2115 {
2116  int ret;
2117  int evt_idx;
2118  pe_control_t *ctl = ( pe_control_t *) ( ESI->ctl_state );
2119 
2120  /* Since you can't profile on a derived event, the event is always the */
2121  /* first and only event in the native event list. */
2122  evt_idx = ESI->EventInfoArray[EventIndex].pos[0];
2123 
2124  if ( threshold == 0 ) {
2125  SUBDBG( "MUNMAP(%p,%"PRIu64")\n", ctl->events[evt_idx].mmap_buf,
2126  ( uint64_t ) ctl->events[evt_idx].nr_mmap_pages *
2127  getpagesize( ) );
2128 
2129  if ( ctl->events[evt_idx].mmap_buf ) {
2130  munmap( ctl->events[evt_idx].mmap_buf,
2131  ctl->events[evt_idx].nr_mmap_pages * getpagesize() );
2132  }
2133  ctl->events[evt_idx].mmap_buf = NULL;
2134  ctl->events[evt_idx].nr_mmap_pages = 0;
2135  ctl->events[evt_idx].attr.sample_type &= ~PERF_SAMPLE_IP;
2136  ret = _pe_set_overflow( ESI, EventIndex, threshold );
2137  /* ??? #warning "This should be handled somewhere else" */
2138  ESI->state &= ~( PAPI_OVERFLOWING );
2139  ESI->overflow.flags &= ~( PAPI_OVERFLOW_HARDWARE );
2140 
2141  return ret;
2142  }
2143 
2144  /* Profiling on data or instruction addresses (EAR) is unsupported */
2145  if ( ESI->profile.flags & ( PAPI_PROFIL_DATA_EAR | PAPI_PROFIL_INST_EAR ) ) {
2146  /* Not supported yet... */
2147 
2148  return PAPI_ENOSUPP;
2149  }
2150  if ( ESI->profile.flags & PAPI_PROFIL_RANDOM ) {
2151  /* This requires an ability to randomly alter the sample_period within */
2152  /* a given range. Kernel does not have this ability. FIXME */
2153  return PAPI_ENOSUPP;
2154  }
2155 
2156  /* Just a guess at how many pages would make this relatively efficient. */
2157  /* Note that it's "1 +" because of the need for a control page, and the */
2158  /* number following the "+" must be a power of 2 (1, 2, 4, 8, 16, ...) or */
2159  /* zero. This is required to optimize dealing with circular buffer */
2160  /* wrapping of the mapped pages. */
2161 
2162  ctl->events[evt_idx].nr_mmap_pages = (1+8);
2163  ctl->events[evt_idx].attr.sample_type |= PERF_SAMPLE_IP;
2164 
2165  ret = _pe_set_overflow( ESI, EventIndex, threshold );
2166  if ( ret != PAPI_OK ) return ret;
2167 
2168  return PAPI_OK;
2169 }
2170 
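/* The user-level entry point for profiling is PAPI_profil(). A minimal
 * sketch: the helper name, bucket count, and threshold are arbitrary,
 * text_start must be the base of the address range being profiled, and
 * the scale of 65536 maps that range one-to-one onto the buffer,
 * following profil(3) conventions. Calling PAPI_profil() again with
 * threshold 0 tears the buffer down via the munmap path above. */

#include <string.h>
#include <sys/types.h>
#include "papi.h"

#define PROFILE_BUCKETS 8192

static unsigned short profbuf[PROFILE_BUCKETS];

static int
start_ip_profiling( int EventSet, caddr_t text_start )
{
        memset( profbuf, 0, sizeof ( profbuf ) );
        /* bufsiz is in bytes; count one sample per 1000000 cycles */
        return PAPI_profil( profbuf, sizeof ( profbuf ), text_start, 65536,
                            EventSet, PAPI_TOT_CYC, 1000000,
                            PAPI_PROFIL_POSIX );
}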
2171 
2172 /* Our component vector */
2173 
2174 papi_vector_t _perf_event_vector = {
2175  .cmp_info = {
2176  /* component information (unspecified values initialized to 0) */
2177  .name = "perf_event",
2178  .short_name = "perf",
2179  .version = "5.0",
2180  .description = "Linux perf_event CPU counters",
2181 
2182  .default_domain = PAPI_DOM_USER,
2183  .available_domains = PAPI_DOM_USER | PAPI_DOM_KERNEL | PAPI_DOM_SUPERVISOR,
2184  .default_granularity = PAPI_GRN_THR,
2185  .available_granularities = PAPI_GRN_THR | PAPI_GRN_SYS,
2186 
2187  .hardware_intr = 1,
2188  .kernel_profile = 1,
2189 
2190  /* component specific cmp_info initializations */
2191  .fast_virtual_timer = 0,
2192  .attach = 1,
2193  .attach_must_ptrace = 1,
2194  .cpu = 1,
2195  .inherit = 1,
2196  .cntr_umasks = 1,
2197 
2198  },
2199 
2200  /* sizes of framework-opaque component-private structures */
2201  .size = {
2202  .context = sizeof ( pe_context_t ),
2203  .control_state = sizeof ( pe_control_t ),
2204  .reg_value = sizeof ( int ),
2205  .reg_alloc = sizeof ( int ),
2206  },
2207 
2208  /* function pointers in this component */
2209  .init_component = _pe_init_component,
2210  .shutdown_component = _pe_shutdown_component,
2211  .init_thread = _pe_init_thread,
2212  .init_control_state = _pe_init_control_state,
2213  .dispatch_timer = _pe_dispatch_timer,
2214 
2215  /* function pointers from the shared perf_event lib */
2216  .start = _pe_start,
2217  .stop = _pe_stop,
2218  .read = _pe_read,
2219  .shutdown_thread = _pe_shutdown_thread,
2220  .ctl = _pe_ctl,
2221  .update_control_state = _pe_update_control_state,
2222  .set_domain = _pe_set_domain,
2223  .reset = _pe_reset,
2224  .set_overflow = _pe_set_overflow,
2225  .set_profile = _pe_set_profile,
2226  .stop_profiling = _pe_stop_profiling,
2227  .write = _pe_write,
2228 
2229 
2230  /* from counter name mapper */
2231  .ntv_enum_events = _pe_ntv_enum_events,
2232  .ntv_name_to_code = _pe_ntv_name_to_code,
2233  .ntv_code_to_name = _pe_ntv_code_to_name,
2234  .ntv_code_to_descr = _pe_ntv_code_to_descr,
2235  .ntv_code_to_info = _pe_ntv_code_to_info,
2236 };
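/* How user code can find this component once PAPI_library_init() has
 * succeeded; a short sketch using the public component-query API (the
 * helper name is arbitrary): */

#include <stdio.h>
#include "papi.h"

static void
show_perf_event_component( void )
{
        int cidx = PAPI_get_component_index( "perf_event" );
        const PAPI_component_info_t *info;

        if ( cidx < 0 ) {
                printf( "perf_event component not found\n" );
                return;
        }
        info = PAPI_get_component_info( cidx );
        printf( "%s: %s (version %s)\n",
                info->short_name, info->description, info->version );
}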