I have started playing a bit with components and tried the infiniband component. I am not an ibverbs programmer, or even a network coder, so I am having some problems getting off the ground. I am using PAPI 5.1.0-2.
First, I have a problem in that the node I am testing on has two IB ports, but only one is active. Initially papi_component_avail failed because of this. I wrote the attached patch to get beyond that, but I still cannot get an example such as papi_command_line infiniband:::mlx4_0_1_recv to work, either despite or because of my patches. gdb shows the following backtrace:

(gdb) where
#0 0x00000039c4e081a0 in mad_get_retries () from /usr/lib64/libibmad.so.5
#1 0x00000039c4e0a06b in mad_rpc () from /usr/lib64/libibmad.so.5
#2 0x00000039c4e0ab0c in pma_query_via () from /usr/lib64/libibmad.so.5
#3 0x0000000000427ed8 in read_ib_counter (ctx=0x0, ctrl=0x7fffa0dbf5f0) at components/infiniband/linux-infiniband.c:267
#4 host_read_values (ctx=0x0, ctrl=0x7fffa0dbf5f0) at components/infiniband/linux-infiniband.c:316
#5 INFINIBAND_start (ctx=0x0, ctrl=0x7fffa0dbf5f0) at components/infiniband/linux-infiniband.c:590
#6 0x00000000004092d2 in PAPI_start (EventSet=0) at papi.c:2169
#7 0x00000000004027c3 in main (argc=0, argv=0x7fffa0dbf5f0) at command_line.c:113
Should this work? Who owns the component, so that I can discuss with them what is going on?
What I tried to achieve with my patch was to allow only some of the ports to be initialized. So I introduced a new state, portdata->is_initialized == -1, to indicate "I tried that port, but can't use it".
- Code: Select all
diff -rup ../papi-5.1.0/src/components/infiniband/linux-infiniband.c rapl_net_infiniband_mx/src/components/infiniband/linux-infiniband.c
--- ../papi-5.1.0/src/components/infiniband/linux-infiniband.c 2013-01-15 15:44:39.000000000 -0500
+++ rapl_net_infiniband_mx/src/components/infiniband/linux-infiniband.c 2013-03-08 06:08:15.504084587 -0500
@@ -1,3 +1,5 @@
+#define VERBOSE 1
+//#undef VERBOSE
/****************************/
/* THIS IS OPEN SOURCE CODE */
/****************************/
@@ -184,6 +186,9 @@ addIBPort( const char *ca_name, umad_por
static int
init_ib_port( ib_port * portdata )
{
+ ib_portid_t old_portid=portid;
+ int old_ibportnum=ibportnum;
+
int mgmt_classes[4] = { IB_SMI_CLASS, IB_SMI_DIRECT_CLASS, IB_SA_CLASS,
IB_PERFORMANCE_CLASS
};
@@ -193,29 +198,38 @@ init_ib_port( ib_port * portdata )
srcport = mad_rpc_open_port( ca, portdata->port_number, mgmt_classes, 4 );
if ( !srcport ) {
- fprintf( stderr, "Failed to open '%s' port '%d'\n", ca,
- portdata->port_number );
- exit( 1 );
+ fprintf( stderr, "%s[%s](%d):: Failed to open '%s' port '%d'\n",
+ __FILE__, __FUNCTION__, __LINE__,
+ ca, portdata->port_number );
+ portdata->is_initialized = -1;
+ return 1;
}
if ( ib_resolve_self_via( &portid, &ibportnum, 0, srcport ) < 0 ) {
- fprintf( stderr, "can't resolve self port\n" );
- exit( 1 );
+ fprintf( stderr, "%s[%s](%d):: Can't resolve self port\n", __FILE__, __FUNCTION__, __LINE__ );
+ portdata->is_initialized = -1;
+ portid=old_portid;
+ ibportnum=old_ibportnum;
+ return 1;
}
/* PerfMgt ClassPortInfo is a required attribute */
/* might be redundant, could be left out for fast implementation */
if ( !pma_query_via
( pc, &portid, ibportnum, ib_timeout, CLASS_PORT_INFO, srcport ) ) {
- fprintf( stderr, "classportinfo query\n" );
- exit( 1 );
+ fprintf( stderr, "%s[%s](%d):: classportinfo query failed\n", __FILE__, __FUNCTION__, __LINE__ );
+ portdata->is_initialized = -1;
+ portid=old_portid;
+ return 1;
}
if ( !performance_reset_via
( pc, &portid, ibportnum, mask, ib_timeout, IB_GSI_PORT_COUNTERS,
srcport ) ) {
- fprintf( stderr, "perf reset\n" );
- exit( 1 );
+ fprintf( stderr, "%s[%s](%d)::performance_reset_via failed\n", __FILE__, __FUNCTION__, __LINE__);
+ portdata->is_initialized = -1;
+ portid=old_portid;
+ return 1;
}
/* read the initial values */
@@ -226,6 +240,7 @@ init_ib_port( ib_port * portdata )
portdata->is_initialized = 1;
+ fprintf( stderr, "%s[%s](%d)::port %s is initialized \n", __FILE__, __FUNCTION__, __LINE__, ca);
return 0;
}
@@ -372,18 +387,23 @@ host_subscribe( const char *cntr )
strncpy( tmp_name, cntr, len - 5 );
tmp_name[len - 5] = 0;
aktp = root_ib_port;
- // printf("looking for IB port '%s'\n", tmp_name);
+#ifdef VERBOSE
+ fprintf(stderr, "%s:: looking for %s at IB port '%s'\n",
+ __FUNCTION__, cntr, tmp_name );
+#endif
while ( aktp != NULL ) {
if ( strcmp( aktp->name, tmp_name ) == 0 ) {
- if ( !aktp->is_initialized ) {
+ if ( aktp->is_initialized == 0 ) {
init_ib_port( aktp );
+ }
+ if ( aktp->is_initialized > 0 ) {
active_ib_port = aktp;
+ return loop + 1;
}
- return loop + 1;
}
/* name does not match, if this counter is
initialized, we can't have two active IB ports */
- if ( aktp->is_initialized ) {
+ if ( aktp->is_initialized > 0) {
#if 0 /* not necessary with OFED version >= 1.4 */
fprintf( stderr,
"unable to activate IB port monitoring for more than one port\n" );
@@ -513,8 +533,12 @@ INFINIBAND_init_thread( hwd_context_t *
counter_list = host_listCounter( num_counters );
- for ( i = 0; i < counter_list->count; i++ )
+ for ( i = 0; i < counter_list->count; i++ ) {
+#ifdef VERBOSE
+ fprintf(stderr,"%s(%d):: Subscribing to counter list entry %3d %s\n",__FUNCTION__,__LINE__,i, counter_list->data[i] );
+#endif
host_subscribe( counter_list->data[i] );
+ }
( ( INFINIBAND_context_t * ) ctx )->state.ncounter = counter_list->count;
@@ -529,7 +553,7 @@ INFINIBAND_init_thread( hwd_context_t *
* PAPI process is initialized (IE PAPI_library_init)
*/
int
-INFINIBAND_init_component( )
+INFINIBAND_init_component( int cidx )
{
int i;
Only in rapl_net_infiniband_mx/src/components/lmsensors: config.log
diff -rup ../papi-5.1.0/src/components/mx/linux-mx.c rapl_net_infiniband_mx/src/components/mx/linux-mx.c
--- ../papi-5.1.0/src/components/mx/linux-mx.c 2013-01-15 15:44:39.000000000 -0500
+++ rapl_net_infiniband_mx/src/components/mx/linux-mx.c 2013-03-08 05:18:07.133219079 -0500
@@ -220,7 +220,7 @@ read_mx_counters( long long *counters )
* PAPI process is initialized (IE PAPI_library_init)
*/
int
-_mx_init_component( )
+_mx_init_component(int cidx )
{
FILE *fff;
Cheers,
/Nils