! / 1250207511 0 0 0 1168 ` <BBBBBBBBBBBMMMMMMMMMzzzzzzzzzz&&&&&&&&&minuminmaxumax_Z21sinplace_T_odd__entryPfii_Z22sinplace_T_even__entryPfii__device_stub__Z14sinplace_T_oddPfii__device_stub__Z15sinplace_T_evenPfiirsqrtrsqrtfmagmablas_sinplace_transposeminuminmaxumax_Z20stranspose_32__entryPfiS_i__device_stub__Z13stranspose_32PfiS_irsqrtrsqrtfmagmablas_stransposeminuminmaxumax_Z16myslaswp___entry15slaswp_params_t__device_stub__Z9myslaswp_15slaswp_params_trsqrtrsqrtfslaswp2magmablas_spermute_longminuminmaxumax_Z21dinplace_T_odd__entryPdii_Z22dinplace_T_even__entryPdii__device_stub__Z14dinplace_T_oddPdii__device_stub__Z15dinplace_T_evenPdiirsqrtrsqrtfmagmablas_dinplace_transposeminuminmaxumax_Z20dtranspose_32__entryPdiS_i__device_stub__Z13dtranspose_32PdiS_irsqrtrsqrtfmagmablas_dtransposeminuminmaxumax_Z16mydlaswp___entry15dlaswp_params_t__device_stub__Z9mydlaswp_15dlaswp_params_trsqrtrsqrtfdlaswp2magmablas_dpermute_long// 50 ` sinplace_transpose.cu_o/ dinplace_transpose.cu_o/ /0 1250207506 500 500 100644 18496 ` ELF;4(UUE 9~]Ð&'UUE 9v]Ð&'UUE 9}]Ð&'UUE 9s]Ð&'U($$PD$$D$ D$$D$D$D$D$ D$D$D$$D$ D$D$D$D$D$ D$D$$ÐU$Ðv'U(EED$D$$tÍE D$D$$uߍED$D$$u$ÐU(EED$D$$tÍE D$D$$uߍED$D$$u$ÐUEzt$==Í&'UE$]EÐU(EEE EEEEED$D$$tÍED$D$$uߍED$D$$u$Ðt&U(EEE EEEEED$D$$tÍED$D$$uߍED$D$$u$Ðt&U($$PD$$D$ D$$D$D$D$D$ D$D$D$$D$ D$D$D$D$D$ D$D$$ÐUhUuu}} ]E EEtmXỦ]ED$D$D$ D$D$$\$D$]u}]ÍvBẺ]ED$D$D$ D$D$$\$D$uEEu}]D$D$$aED$D$$>ED$D$$$EEu}]D$D$$ED$D$$ED$D$$$jXc`%0_Z14sinplace_T_oddPfii_Z15sinplace_T_evenPfii59be0edf51cf8875sinplace_transpose.cu compute_13sm_13?Z h̎/@V :N .version 1.4 .target sm_13 // compiled with /usr/local/cuda-2.2/open64/lib//be // nvopencc built on 2009-04-09 .reg .u32 %ra<17>; .reg .u64 %rda<17>; .reg .f32 %fa<17>; .reg .f64 %fda<17>; .reg .u32 %rv<5>; .reg .u64 %rdv<5>; .reg .f32 %fv<5>; .reg .f64 %fdv<5>; //----------------------------------------------------------- // Compiling /tmp/tmpxft_000067ed_00000000-7_sinplace_transpose.cpp3.i (/tmp/ccBI#.JJwY0z) //----------------------------------------------------------- //----------------------------------------------------------- // Options: //----------------------------------------------------------- // Target:ptx, ISA:sm_13, Endian:little, Pointer Size:32 // -O3 (Optimization level) // -g0 (Debug level) // -m2 (Report advisories) //----------------------------------------------------------- .file 1 "/tmp/tmpxft_000067ed_00000000-6_sinplace_transpose.cudafe2.gpu" .file 2 "/usr/lib/gcc/x86_64-redhat-linux/4.1.2/include/stddef.h" .file 3 "/usr/local/cuda/bin//../include/crt/device_runtime.h" .file 4 "/usr/local/cuda/bin//../include/host_defines.h" .file 5 "/usr/local/cuda/bin//../include/builtin_types.h" .file 6 "/usr/local/cuda/bin//../include/device_types.h" .file 7 "/usr/local/cuda/bin//../include/driver_types.h" .file 8 "/usr/local/cuda/bin//../include/texture_types.h" .file 9 "/usr/local/cuda/bin//../include/vector_types.h" .file 10 "/usr/local/cuda/bin//../include/device_launch_parameters.h" .file 11 "/usr/local/cuda/bin//../include/crt/storage_class.h" .file 12 "/usr/include/bits/types.h" .file 13 "/usr/include/time.h" .file 14 "sinplace_transpose.cu" .file 15 "/usr/local/cuda/bin//../include/common_functions.h" .file 16 "/usr/local/cuda/bin//../include/crt/func_macro.h" .file 17 "/usr/local/cuda/bin//../include/math_functions.h" .file 18 "/usr/local/cuda/bin//../include/device_functions.h" .file 19 "/usr/local/cuda/bin//../include/math_constants.h" .file 20 "/usr/local/cuda/bin//../include/sm_11_atomic_functions.h" .file 21 "/usr/local/cuda/bin//../include/sm_12_atomic_functions.h" .file 22 "/usr/local/cuda/bin//../include/sm_13_double_functions.h" .file 23 "/usr/local/cuda/bin//../include/texture_fetch_functions.h" .file 24 "/usr/local/cuda/bin//../include/math_functions_dbl_ptx3.h" .entry _Z15sinplace_T_evenPfii ( .param .u32 __cudaparm__Z15sinplace_T_evenPfii_matrix, .param .s32 __cudaparm__Z15sinplace_T_evenPfii_lda, .param .s32 __cudaparm__Z15sinplace_T_evenPfii_half) { .reg .u32 %r<43>; .reg .f32 %f<12>; .reg .pred %p<4>; .shared .align 4 .b8 __cuda_a12[4224]; .shared .align 4 .b8 __cuda_b4236[4224]; .loc 14 9 0 $LBB1__Z15sinplace_T_evenPfii: mov.u32 %r1, __cuda_a12; // .loc 14 24 0 cvt.u32.u16 %r2, %ctaid.y; // cvt.u32.u16 %r3, %ctaid.x; // setp.lt.u32 %p1, %r2, %r3; // ld.param.u32 %r4, [__cudaparm__Z15sinplace_T_evenPfii_half]; // id:91 __cudaparm__Z15sinplace_T_evenPfii_half+0x0 add.u32 %r5, %r4, %r3; // sub.s32 %r6, %r3, 1; // add.u32 %r7, %r4, %r2; // cvt.s32.u16 %r8, %tid.y; // mul24.lo.u32 %r9, %r8, 33; // selp.s32 %r10, %r2, %r5, %p1; // selp.s32 %r11, %r6, %r7, %p1; // cvt.s32.u16 %r12, %tid.x; // add.u32 %r13, %r12, %r9; // mul.lo.s32 %r14, %r10, 32; // mul.lo.s32 %r15, %r11, 32; // mul.lo.u32 %r16, %r13, 4; // add.s32 %r17, %r14, %r8; // add.u32 %r18, %r15, %r12; // add.u32 %r19, %r16, %r1; // ld.param.s32 %r20, [__cudaparm__Z15sinplace_T_evenPfii_lda]; // id:88 __cudaparm__Z15sinplace_T_evenPfii_lda+0x0 mul24.lo.s32 %r21, %r17, %r20; // add.u32 %r22, %r21, %r18; // mul.lo.u32 %r23, %r22, 4; // ld.param.u32 %r24, [__cudaparm__Z15sinplace_T_evenPfii_matrix]; // id:87 __cudaparm__Z15sinplace_T_evenPfii_matrix+0x0 add.u32 %r25, %r23, %r24; // ld.global.f32 %f1, [%r25+0]; // id:94 st.shared.f32 [%r19+0], %f1; // id:95 __cuda_a12+0x0 .loc 14 25 0 mul.lo.u32 %r26, %r20, 64; // add.u32 %r27, %r26, %r25; // ld.global.f32 %f2, [%r27+0]; // id:96 st.shared.f32 [%r19+2112], %f2; // id:97 __cuda_a12+0x0 mul24.lo.u32 %r28, %r12, 33; // add.u32 %r29, %r8, %r28; // mul.lo.u32 %r30, %r29, 4; // add.u32 %r31, %r30, %r1; // setp.ne.s32 %p2, %r14, %r15; // @%p2 bra $Lt_0_2818; // .loc 14 29 0 bar.sync 0; // .loc 14 30 0 ld.shared.f32 %f3, [%r31+0]; // id:98 __cuda_a12+0x0 st.global.f32 [%r25+0], %f3; // id:99 .loc 14 31 0 ld.shared.f32 %f4, [%r31+64]; // id:100 __cuda_a12+0x0 st.global.f32 [%r27+0], %f4; // id:101 bra.uni $Lt_0_2562; // $Lt_0_2818: mov.u32 %r32, __cuda_b4236; // .loc 14 37 0 add.u32 %r33, %r14, %r12; // add.s32 %r34, %r15, %r8; // add.u32 %r35, %r16, %r32; // mul24.lo.s32 %r36, %r34, %r20; // add.u32 %r37, %r36, %r33; // mul.lo.u32 %r38, %r37, 4; // add.u32 %r39, %r38, %r24; // ld.global.f32 %f5, [%r39+0]; // id:102 st.shared.f32 [%r35+0], %f5; // id:103 __cuda_b4236+0x0 .loc 14 38 0 add.u32 %r40, %r26, %r39; // ld.global.f32 %f6, [%r40+0]; // id:104 st.shared.f32 [%r35+2112], %f6; // id:105 __cuda_b4236+0x0 .loc 14 39 0 bar.sync 0; // .loc 14 40 0 add.u32 %r41, %r30, %r32; // ld.shared.f32 %f7, [%r41+0]; // id:106 __cuda_b4236+0x0 st.global.f32 [%r25+0], %f7; // id:107 .loc 14 41 0 ld.shared.f32 %f8, [%r41+64]; // id:108 __cuda_b4236+0x0 st.global.f32 [%r27+0], %f8; // id:109 .loc 14 42 0 ld.shared.f32 %f9, [%r31+0]; // id:110 __cuda_a12+0x0 st.global.f32 [%r39+0], %f9; // id:111 .loc 14 43 0 ld.shared.f32 %f10, [%r31+64]; // id:112 __cuda_a12+0x0 st.global.f32 [%r40+0], %f10; // id:113 $Lt_0_2562: .loc 14 45 0 exit; // $LDWend__Z15sinplace_T_evenPfii: } // _Z15sinplace_T_evenPfii .entry _Z14sinplace_T_oddPfii ( .param .u32 __cudaparm__Z14sinplace_T_oddPfii_matrix, .param .s32 __cudaparm__Z14sinplace_T_oddPfii_lda, .param .s32 __cudaparm__Z14sinplace_T_oddPfii_half) { .reg .u32 %r<43>; .reg .f32 %f<12>; .reg .pred %p<4>; .shared .align 4 .b8 __cuda_a8472[4224]; .shared .align 4 .b8 __cuda_b12696[4224]; .loc 14 47 0 $LBB1__Z14sinplace_T_oddPfii: mov.u32 %r1, __cuda_a8472; // .loc 14 62 0 cvt.u32.u16 %r2, %ctaid.y; // cvt.u32.u16 %r3, %ctaid.x; // setp.le.u32 %p1, %r2, %r3; // ld.param.u32 %r4, [__cudaparm__Z14sinplace_T_oddPfii_half]; // id:91 __cudaparm__Z14sinplace_T_oddPfii_half+0x0 add.u32 %r5, %r4, %r3; // add.u32 %r6, %r4, %r2; // cvt.s32.u16 %r7, %tid.y; // mul24.lo.u32 %r8, %r7, 33; // selp.s32 %r9, %r2, %r5, %p1; // sub.s32 %r10, %r6, 1; // cvt.s32.u16 %r11, %tid.x; // add.u32 %r12, %r11, %r8; // mul.lo.s32 %r13, %r9, 32; // selp.s32 %r14, %r3, %r10, %p1; // mul.lo.u32 %r15, %r12, 4; // add.s32 %r16, %r13, %r7; // mul.lo.s32 %r17, %r14, 32; // add.u32 %r18, %r15, %r1; // ld.param.s32 %r19, [__cudaparm__Z14sinplace_T_oddPfii_lda]; // id:88 __cudaparm__Z14sinplace_T_oddPfii_lda+0x0 mul24.lo.s32 %r20, %r16, %r19; // add.u32 %r21, %r17, %r11; // add.u32 %r22, %r20, %r21; // mul.lo.u32 %r23, %r22, 4; // ld.param.u32 %r24, [__cudaparm__Z14sinplace_T_oddPfii_matrix]; // id:87 __cudaparm__Z14sinplace_T_oddPfii_matrix+0x0 add.u32 %r25, %r23, %r24; // ld.global.f32 %f1, [%r25+0]; // id:94 st.shared.f32 [%r18+0], %f1; // id:95 __cuda_a8472+0x0 .loc 14 63 0 mul.lo.u32 %r26, %r19, 64; // add.u32 %r27, %r26, %r25; // ld.global.f32 %f2, [%r27+0]; // id:96 st.shared.f32 [%r18+2112], %f2; // id:97 __cuda_a8472+0x0 mul24.lo.u32 %r28, %r11, 33; // add.u32 %r29, %r7, %r28; // mul.lo.u32 %r30, %r29, 4; // add.u32 %r31, %r30, %r1; // setp.ne.s32 %p2, %r13, %r17; // @%p2 bra $Lt_1_2818; // .loc 14 67 0 bar.sync 0; // .loc 14 68 0 ld.shared.f32 %f3, [%r31+0]; // id:98 __cuda_a8472+0x0 st.global.f32 [%r25+0], %f3; // id:99 .loc 14 69 0 ld.shared.f32 %f4, [%r31+64]; // id:100 __cuda_a8472+0x0 st.global.f32 [%r27+0], %f4; // id:101 bra.uni $Lt_1_2562; // $Lt_1_2818: mov.u32 %r32, __cuda_b12696; // .loc 14 75 0 add.u32 %r33, %r13, %r11; // add.u32 %r34, %r15, %r32; // add.s32 %r35, %r17, %r7; // mul24.lo.s32 %r36, %r35, %r19; // add.u32 %r37, %r36, %r33; // mul.lo.u32 %r38, %r37, 4; // add.u32 %r39, %r38, %r24; // ld.global.f32 %f5, [%r39+0]; // id:102 st.shared.f32 [%r34+0], %f5; // id:103 __cuda_b12696+0x0 .loc 14 76 0 add.u32 %r40, %r26, %r39; // ld.global.f32 %f6, [%r40+0]; // id:104 st.shared.f32 [%r34+2112], %f6; // id:105 __cuda_b12696+0x0 .loc 14 77 0 bar.sync 0; // .loc 14 78 0 add.u32 %r41, %r30, %r32; // ld.shared.f32 %f7, [%r41+0]; // id:106 __cuda_b12696+0x0 st.global.f32 [%r25+0], %f7; // id:107 .loc 14 79 0 ld.shared.f32 %f8, [%r41+64]; // id:108 __cuda_b12696+0x0 st.global.f32 [%r27+0], %f8; // id:109 .loc 14 80 0 ld.shared.f32 %f9, [%r31+0]; // id:110 __cuda_a8472+0x0 st.global.f32 [%r39+0], %f9; // id:111 .loc 14 81 0 ld.shared.f32 %f10, [%r31+64]; // id:112 __cuda_a8472+0x0 st.global.f32 [%r40+0], %f10; // id:113 $Lt_1_2562: .loc 14 83 0 exit; // $LDWend__Z14sinplace_T_oddPfii: } // _Z14sinplace_T_oddPfii architecture {sm_13} abiversion {1} modname {cubin} code { name = _Z15sinplace_T_evenPfii lmem = 0 smem = 8476 reg = 9 bar = 1 const { segname = const segnum = 1 offset = 0 bytes = 8 mem { 0x000003ff 0x00000021 } } bincode { 0x10000005 0x0403c780 0xa0004e11 0x04200780 0xa0004c01 0x04200780 0x300009fd 0x640047c8 0x2000cc0d 0x04200780 0x203f8009 0x0fffffff 0xd0800601 0x00400780 0x1000080d 0x0403c280 0x2000cc09 0x04210500 0xa0000001 0x04000780 0x30050611 0xc4100780 0xa0000405 0x04000780 0x3005041d 0xc4100780 0x20048008 0x2007820c 0x4142ea08 0x20038408 0x30020415 0xc4100780 0x3006ca19 0xc4300780 0x6081020d 0x60400780 0x60810009 0x60404780 0x2000c815 0x04214780 0x00020605 0xc0000780 0x00020409 0xc0000780 0x20000a09 0x04018780 0xd00e0a21 0x80c00780 0xd00e040d 0x80c00780 0x08000e01 0xe4220780 0x300709fd 0x6c0147c8 0x08042e01 0xe420c780 0x10023003 0x00000280 0x861ffe03 0x00000000 0x14000e01 0x4400c780 0xd00e0a01 0xa0c00780 0x14002e01 0x4400c780 0xd00e0401 0xa0c00780 0x30000003 0x00000780 0x20008e0c 0x20018800 0x4143ea04 0x20008200 0x30020001 0xc4100780 0x2100e804 0x2006820c 0xd00e0211 0x80c00780 0xd00e0601 0x80c00780 0x08084e01 0xe4210780 0x080c6e01 0xe4200780 0x861ffe03 0x00000000 0xd4213809 0x20000780 0x18000001 0x4400c780 0xd00e0a01 0xa0c00780 0x18002001 0x4400c780 0xd00e0401 0xa0c00780 0x14000e01 0x4400c780 0xd00e0201 0xa0c00780 0x14002e01 0x4400c780 0xd00e0601 0xa0c00781 } } code { name = _Z14sinplace_T_oddPfii lmem = 0 smem = 8476 reg = 9 bar = 1 const { segname = const segnum = 1 offset = 0 bytes = 8 mem { 0x000003ff 0x00000021 } } bincode { 0x10000005 0x0403c780 0xa0004e09 0x04200780 0xa0004c15 0x04200780 0x300505fd 0x6400c7c8 0x2105ec0c 0x2102ec10 0xd0800601 0x00400780 0x1000040d 0x0403c280 0x203f8809 0x0fffffff 0xa0000001 0x04000780 0x30050611 0xc4100780 0x10000a09 0x0403c280 0xa0000405 0x04000780 0x2000000d 0x04010780 0x3005041d 0xc4100780 0x4143ea08 0x2007820c 0x20000409 0x0400c780 0x30020415 0xc4100780 0x3006ca19 0xc4300780 0x6081020d 0x60400780 0x60810009 0x60404780 0x2000c815 0x04214780 0x00020605 0xc0000780 0x00020409 0xc0000780 0x20000a09 0x04018780 0xd00e0a21 0x80c00780 0xd00e040d 0x80c00780 0x08000e01 0xe4220780 0x300709fd 0x6c0147c8 0x08042e01 0xe420c780 0x10024003 0x00000280 0x861ffe03 0x00000000 0x14000e01 0x4400c780 0xd00e0a01 0xa0c00780 0x14002e01 0x4400c780 0xd00e0401 0xa0c00780 0x30000003 0x00000780 0x20008e0c 0x20018800 0x4143ea04 0x20008200 0x30020001 0xc4100780 0x2100e804 0x2006820c 0xd00e0211 0x80c00780 0xd00e0601 0x80c00780 0x08084e01 0xe4210780 0x080c6e01 0xe4200780 0x861ffe03 0x00000000 0xd4213809 0x20000780 0x18000001 0x4400c780 0xd00e0a01 0xa0c00780 0x18002001 0x4400c780 0xd00e0401 0xa0c00780 0x14000e01 0x4400c780 0xd00e0201 0xa0c00780 0x14002e01 0x4400c780 0xd00e0601 0xa0c00781 } } zP|  AB < AB X@AB t`AB AB PAB pAB AB 2AB AB 8AB TAB p0AB  6AB LFFGCC: (GNU) 4.1.2 20080704 (Red Hat 4.1.2-44).symtab.strtab.shstrtab.rel.text.rel.data.bss.rel.ctors.rodata.str1.1.rodata.cst8.rel.nvFatBinSegment.rodata.rel.eh_frame.comment.note.GNU-stack@6 D)x % pG /84 G?2iN_ < [ G0 p`1 |x9x Gx $;.R;R;? A 30[< P%  Ubq X% `%    " "@"`"*Icp2" "6tmpxft_000067ed_00000000-11_sinplace_transpose.ii_GLOBAL__I_magmablas_sinplace_transpose__sti____cudaRegisterAll_53_tmpxft_000067ed_00000000_4_sinplace_transpose_cpp1_ii_c20e9100__fatDeviceText__cudaUnregisterBinaryUtil__cudaFatCubinHandle_ZZ36__device_stub__Z14sinplace_T_oddPfiiE3__f_ZZ37__device_stub__Z15sinplace_T_evenPfiiE3__f__ptxEntries__cubinEntries__debugEntries__deviceText_$compute_13$__deviceText_$sm_13$min__gxx_personality_v0uminmaxumax__cudaRegisterFatBinaryatexit_Z21sinplace_T_odd__entryPfii__cudaRegisterFunction_Z22sinplace_T_even__entryPfii__cudaUnregisterFatBinary__device_stub__Z14sinplace_T_oddPfiicudaSetupArgumentcudaLaunch__device_stub__Z15sinplace_T_evenPfiirsqrtsqrtrsqrtfmagmablas_sinplace_transposecudaConfigureCall !19A"I!W_#%%%  &)%J%i%sw"~"&)(+%L%k%uy""&%% %  %&9>EJOT !"!,,3%V%y%""&%%% $( -&  $@\x <Xtstranspose.cu_o/1250207507 500 500 100644 11300 ` ELF"4(UUE 9~]Ð&'UUE 9v]Ð&'UUE 9}]Ð&'UUE 9s]Ð&'U($$D$$D$ D$D$D$D$D$ D$D$$ÍU$Ðv'U(EED$D$$tÍE D$D$$uߍED$D$$uED$ D$$u$Ð&U]UEzt$==Í&'UE$]EÐUXEM]]uuEЋE}} E E‰EU܉EED$D$D$ D$D$$D$D$t]u}]ÍvEЉ]]uu} }E]U($$D$$D$ D$D$D$D$D$ D$D$$9D _Z13stranspose_32PfiS_iecef275728201826stranspose.cu compute_13sm_13?Z h̎)7ep .version 1.4 .target sm_13 // compiled with /usr/local/cuda-2.2/open64/lib//be // nvopencc built on 2009-04-09 .reg .u32 %ra<17>; .reg .u64 %rda<17>; .reg .f32 %fa<17>; .reg .f64 %fda<17>; .reg .u32 %rv<5>; .reg .u64 %rdv<5>; .reg .f32 %fv<5>; .reg .f64 %fdv<5>; //----------------------------------------------------------- // Compiling /tmp/tmpxft_00006819_00000000-7_stranspose.cpp3.i (/tmp/ccBI#.k9v7iK) //----------------------------------------------------------- //----------------------------------------------------------- // Options: //----------------------------------------------------------- // Target:ptx, ISA:sm_13, Endian:little, Pointer Size:32 // -O3 (Optimization level) // -g0 (Debug level) // -m2 (Report advisories) //----------------------------------------------------------- .file 1 "/tmp/tmpxft_00006819_00000000-6_stranspose.cudafe2.gpu" .file 2 "/usr/lib/gcc/x86_64-redhat-linux/4.1.2/include/stddef.h" .file 3 "/usr/local/cuda/bin//../include/crt/device_runtime.h" .file 4 "/usr/local/cuda/bin//../include/host_defines.h" .file 5 "/usr/local/cuda/bin//../include/builtin_types.h" .file 6 "/usr/local/cuda/bin//../include/device_types.h" .file 7 "/usr/local/cuda/bin//../include/driver_types.h" .file 8 "/usr/local/cuda/bin//../include/texture_types.h" .file 9 "/usr/local/cuda/bin//../include/vector_types.h" .file 10 "/usr/local/cuda/bin//../include/device_launch_parameters.h" .file 11 "/usr/local/cuda/bin//../include/crt/storage_class.h" .file 12 "/usr/include/bits/types.h" .file 13 "/usr/include/time.h" .file 14 "stranspose.cu" .file 15 "/usr/local/cuda/bin//../include/common_functions.h" .file 16 "/usr/local/cuda/bin//../include/crt/func_macro.h" .file 17 "/usr/local/cuda/bin//../include/math_functions.h" .file 18 "/usr/local/cuda/bin//../include/device_functions.h" .file 19 "/usr/local/cuda/bin//../include/math_constants.h" .file 20 "/usr/local/cuda/bin//../include/sm_11_atomic_functions.h" .file 21 "/usr/local/cuda/bin//../include/sm_12_atomic_functions.h" .file 22 "/usr/local/cuda/bin//../include/sm_13_double_functions.h" .file 23 "/usr/local/cuda/bin//../include/texture_fetch_functions.h" .file 24 "/usr/local/cuda/bin//../include/math_functions_dbl_ptx3.h" .entry _Z13stranspose_32PfiS_i ( .param .u32 __cudaparm__Z13stranspose_32PfiS_i___val_paramB, .param .s32 __cudaparm__Z13stranspose_32PfiS_i_ldb, .param .u32 __cudaparm__Z13stranspose_32PfiS_i___val_paramA, .param .s32 __cudaparm__Z13stranspose_32PfiS_i_lda) { .reg .u16 %rh<4>; .reg .u32 %r<43>; .reg .f32 %f<10>; .shared .align 4 .b8 __cuda_a16[4224]; .loc 14 9 0 $LBB1__Z13stranspose_32PfiS_i: mov.u32 %r1, __cuda_a16; // .loc 14 21 0 mov.u16 %rh1, %ctaid.x; // mul.wide.u16 %r2, %rh1, 32; // mov.u16 %rh2, %ctaid.y; // mul.wide.u16 %r3, %rh2, 32; // cvt.s32.u16 %r4, %tid.y; // mul24.lo.u32 %r5, %r4, 33; // cvt.s32.u16 %r6, %tid.x; // add.s32 %r7, %r6, %r2; // add.s32 %r8, %r4, %r3; // add.u32 %r9, %r6, %r5; // ld.param.s32 %r10, [__cudaparm__Z13stranspose_32PfiS_i_lda]; // id:68 __cudaparm__Z13stranspose_32PfiS_i_lda+0x0 mul24.lo.s32 %r11, %r8, %r10; // mul.lo.u32 %r12, %r9, 4; // add.s32 %r13, %r7, %r11; // add.u32 %r14, %r12, %r1; // mul.lo.u32 %r15, %r13, 4; // ld.param.u32 %r16, [__cudaparm__Z13stranspose_32PfiS_i___val_paramA]; // id:73 __cudaparm__Z13stranspose_32PfiS_i___val_paramA+0x0 add.u32 %r17, %r16, %r15; // ld.global.f32 %f1, [%r17+0]; // id:74 st.shared.f32 [%r14+0], %f1; // id:75 __cuda_a16+0x0 .loc 14 22 0 mul.lo.u32 %r18, %r10, 32; // add.u32 %r19, %r17, %r18; // ld.global.f32 %f2, [%r19+0]; // id:76 st.shared.f32 [%r14+1056], %f2; // id:77 __cuda_a16+0x0 .loc 14 23 0 mul.lo.u32 %r20, %r10, 64; // add.u32 %r21, %r17, %r20; // ld.global.f32 %f3, [%r21+0]; // id:78 st.shared.f32 [%r14+2112], %f3; // id:79 __cuda_a16+0x0 .loc 14 24 0 mul.lo.u32 %r22, %r10, 96; // add.u32 %r23, %r17, %r22; // ld.global.f32 %f4, [%r23+0]; // id:80 st.shared.f32 [%r14+3168], %f4; // id:81 __cuda_a16+0x0 .loc 14 26 0 bar.sync 0; // .loc 14 28 0 mul24.lo.u32 %r24, %r6, 33; // add.s32 %r25, %r4, %r2; // add.s32 %r26, %r6, %r3; // add.u32 %r27, %r4, %r24; // ld.param.s32 %r28, [__cudaparm__Z13stranspose_32PfiS_i_ldb]; // id:67 __cudaparm__Z13stranspose_32PfiS_i_ldb+0x0 mul24.lo.s32 %r29, %r25, %r28; // mul.lo.u32 %r30, %r27, 4; // add.s32 %r31, %r26, %r29; // add.u32 %r32, %r30, %r1; // mul.lo.u32 %r33, %r31, 4; // ld.param.u32 %r34, [__cudaparm__Z13stranspose_32PfiS_i___val_paramB]; // id:82 __cudaparm__Z13stranspose_32PfiS_i___val_paramB+0x0 add.u32 %r35, %r34, %r33; // ld.shared.f32 %f5, [%r32+0]; // id:83 __cuda_a16+0x0 st.global.f32 [%r35+0], %f5; // id:84 .loc 14 29 0 ld.shared.f32 %f6, [%r32+32]; // id:85 __cuda_a16+0x0 mul.lo.u32 %r36, %r28, 32; // add.u32 %r37, %r35, %r36; // st.global.f32 [%r37+0], %f6; // id:86 .loc 14 30 0 ld.shared.f32 %f7, [%r32+64]; // id:87 __cuda_a16+0x0 mul.lo.u32 %r38, %r28, 64; // add.u32 %r39, %r35, %r38; // st.global.f32 [%r39+0], %f7; // id:88 .loc 14 31 0 ld.shared.f32 %f8, [%r32+96]; // id:89 __cuda_a16+0x0 mul.lo.u32 %r40, %r28, 96; // add.u32 %r41, %r35, %r40; // st.global.f32 [%r41+0], %f8; // id:90 .loc 14 32 0 exit; // $LDWend__Z13stranspose_32PfiS_i: } // _Z13stranspose_32PfiS_i architecture {sm_13} abiversion {1} modname {cubin} code { name = _Z13stranspose_32PfiS_i lmem = 0 smem = 4256 reg = 10 bar = 1 const { segname = const segnum = 1 offset = 0 bytes = 8 mem { 0x000003ff 0x00000021 } } bincode { 0xd0800205 0x00400780 0x41202c0d 0x00000003 0x41202e11 0x00000003 0xa0000005 0x04000780 0xa0000201 0x04000780 0x20018614 0x20008808 0x6002ce09 0x80214780 0x30020415 0xc4100780 0x3005ce19 0xc4300780 0x3006ce1d 0xc4300780 0x60810009 0x60404780 0x2105ec14 0x20078c24 0x00020405 0xc0000780 0x20068a08 0x20078a20 0x20000a25 0x04024780 0xd00e0a1d 0x80c00780 0xd00e0419 0x80c00780 0xd00e1015 0x80c00780 0xd00e1209 0x80c00780 0x04001001 0xe421c780 0x04022001 0xe4218780 0x04043001 0xe4214780 0x04064001 0xe4208780 0x861ffe03 0x00000000 0x20038008 0x2004820c 0x60810201 0x60400780 0x6002ca05 0x8020c780 0x00020005 0xc0000780 0x30020205 0xc4100780 0x1500f000 0x2101e804 0x3005ca09 0xc4300780 0xd00e0201 0xa0c00780 0x14002001 0x4400c780 0x2000020d 0x04008780 0x3006ca11 0xc4300780 0xd00e0601 0xa0c00780 0x2000040d 0x04010780 0x14003001 0x4400c780 0x20000209 0x04010780 0xd4010005 0x20000780 0x20000205 0x0400c780 0xd00e0401 0xa0c00780 0x14000001 0x4400c780 0xd00e0201 0xa0c00781 } } zP|  AB < AB X@AB t`AB zAB AB  AB 2AB  AB  @AB LFL@zAB GCC: (GNU) 4.1.2 20080704 (Red Hat 4.1.2-44).symtab.strtab.shstrtab.rel.text.rel.data.bss.rel.ctors.rodata.str1.1.rodata.cst8.rel.nvFatBinSegment.rodata.rel.eh_frame.comment.note.GNU-stack@ D*() % l+ /84 +?2JN@_`< [ +0 p@ |Xx +` 8!.f!f!(% 'z+zKz<   *9 S     h"l "@"`"  $6A2"G L "S@htmpxft_00006819_00000000-11_stranspose.ii_GLOBAL__I_magmablas_stranspose__sti____cudaRegisterAll_45_tmpxft_00006819_00000000_4_stranspose_cpp1_ii_f76129e7__fatDeviceText__cudaUnregisterBinaryUtil__cudaFatCubinHandle_ZZ37__device_stub__Z13stranspose_32PfiS_iE3__f__ptxEntries__cubinEntries__debugEntries__deviceText_$compute_13$__deviceText_$sm_13$min__gxx_personality_v0uminmaxumax__cudaRegisterFatBinaryatexit_Z20stranspose_32__entryPfiS_i__cudaRegisterFunction__cudaUnregisterFatBinary__device_stub__Z13stranspose_32PfiS_icudaSetupArgumentcudaLaunchrsqrtsqrtrsqrtfmagmablas_stransposecudaConfigureCall !I#j###$&  -%)%*/4lt| " "  $@\x Dspermute.cu_o/ 1250207508 500 500 100644 11424 ` ELF!4(UUE 9~]Ð&'UUE 9v]Ð&'UUE 9}]Ð&'UUE 9s]Ð&'U($$D$$D$ D$D$D$D$D$ D$D$$ÍU$Ðv'U(ED$D$E$t$Ðt&UEzt$==Í&'UE$]EÐU($$D$$D$ D$D$D$D$D$ D$D$$ÍU(EED$D$$t$Ðt&UVSuVEEE@?EEED$D$D$ @D$D$$D$D$t [^]ÍLD$t$$LED$D$$u$[^]ÍUWVS|EE HDž?‹EUD$1D$<$Ef)fD_U@uዕEEE@UEED$D$D$ @D$D$$D$D$t+@9U|[^_]ËU D$|$TEPL$HED$D$$h$M=H`_Z9myslaswp_15slaswp_params_tc9c4a42363d47ef7spermute.cu compute_13sm_13?Z h̎/; Z .version 1.4 .target sm_13 // compiled with /usr/local/cuda-2.2/open64/lib//be // nvopencc built on 2009-04-09 .reg .u32 %ra<17>; .reg .u64 %rda<17>; .reg .f32 %fa<17>; .reg .f64 %fda<17>; .reg .u32 %rv<5>; .reg .u64 %rdv<5>; .reg .f32 %fv<5>; .reg .f64 %fdv<5>; //----------------------------------------------------------- // Compiling /tmp/tmpxft_00006845_00000000-7_spermute.cpp3.i (/tmp/ccBI#.7uRNvF) //----------------------------------------------------------- //----------------------------------------------------------- // Options: //----------------------------------------------------------- // Target:ptx, ISA:sm_13, Endian:little, Pointer Size:32 // -O3 (Optimization level) // -g0 (Debug level) // -m2 (Report advisories) //----------------------------------------------------------- .file 1 "/tmp/tmpxft_00006845_00000000-6_spermute.cudafe2.gpu" .file 2 "spermute.cu" .file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.1.2/include/stddef.h" .file 4 "/usr/local/cuda/bin//../include/crt/device_runtime.h" .file 5 "/usr/local/cuda/bin//../include/host_defines.h" .file 6 "/usr/local/cuda/bin//../include/builtin_types.h" .file 7 "/usr/local/cuda/bin//../include/device_types.h" .file 8 "/usr/local/cuda/bin//../include/driver_types.h" .file 9 "/usr/local/cuda/bin//../include/texture_types.h" .file 10 "/usr/local/cuda/bin//../include/vector_types.h" .file 11 "/usr/local/cuda/bin//../include/device_launch_parameters.h" .file 12 "/usr/local/cuda/bin//../include/crt/storage_class.h" .file 13 "/usr/include/bits/types.h" .file 14 "/usr/include/time.h" .file 15 "/usr/local/cuda/bin//../include/common_functions.h" .file 16 "/usr/local/cuda/bin//../include/crt/func_macro.h" .file 17 "/usr/local/cuda/bin//../include/math_functions.h" .file 18 "/usr/local/cuda/bin//../include/device_functions.h" .file 19 "/usr/local/cuda/bin//../include/math_constants.h" .file 20 "/usr/local/cuda/bin//../include/sm_11_atomic_functions.h" .file 21 "/usr/local/cuda/bin//../include/sm_12_atomic_functions.h" .file 22 "/usr/local/cuda/bin//../include/sm_13_double_functions.h" .file 23 "/usr/local/cuda/bin//../include/texture_fetch_functions.h" .file 24 "/usr/local/cuda/bin//../include/math_functions_dbl_ptx3.h" .entry _Z9myslaswp_15slaswp_params_t ( .param .align 4 .b8 __cudaparm__Z9myslaswp_15slaswp_params_t___val_paramparams[144]) { .reg .u32 %r<30>; .reg .f32 %f<4>; .reg .pred %p<5>; .local .align 4 .b8 __cuda___cuda_params_16144[144]; .loc 2 17 0 $LBB1__Z9myslaswp_15slaswp_params_t: mov.u32 %r1, __cudaparm__Z9myslaswp_15slaswp_params_t___val_paramparams; // mov.u32 %r2, __cuda___cuda_params_16144; // mov.s32 %r3, %r1; // mov.s32 %r4, %r2; // mov.s32 %r5, 36; // $L_0_3586: // Loop body line 17, nesting depth: 1, iterations: 36 ld.param.u32 %r6, [%r3+0]; // id:56 __cudaparm__Z9myslaswp_15slaswp_params_t___val_paramparams+0x0 st.local.u32 [%r4+0], %r6; // id:55 __cuda___cuda_params_16144+0x0 add.u32 %r3, %r3, 4; // add.u32 %r4, %r4, 4; // sub.s32 %r5, %r5, 1; // mov.u32 %r7, 0; // setp.gt.s32 %p1, %r5, %r7; // @%p1 bra $L_0_3586; // cvt.s32.u16 %r8, %ntid.x; // cvt.s32.u16 %r9, %ctaid.x; // mul24.lo.s32 %r10, %r8, %r9; // cvt.u32.u16 %r11, %tid.x; // add.u32 %r12, %r10, %r11; // ld.local.u32 %r13, [__cuda___cuda_params_16144+4]; // id:47 __cuda___cuda_params_16144+0x4 setp.le.u32 %p2, %r13, %r12; // @%p2 bra $Lt_0_2050; // ld.local.s32 %r14, [__cuda___cuda_params_16144+8]; // id:41 __cuda___cuda_params_16144+0x8 ld.local.s32 %r15, [__cuda___cuda_params_16144+12]; // id:48 __cuda___cuda_params_16144+0xc mul.lo.s32 %r16, %r15, %r14; // mul.lo.u32 %r17, %r14, 4; // mov.u32 %r18, __cuda___cuda_params_16144; // mov.u32 %r19, __cuda___cuda_params_16144+128; // add.u32 %r20, %r16, %r12; // mul.lo.u32 %r21, %r20, 4; // ld.local.u32 %r22, [__cuda___cuda_params_16144+0]; // id:49 __cuda___cuda_params_16144+0x0 add.s32 %r23, %r22, %r21; // mov.s32 %r24, %r23; // $Lt_0_3074: // Loop body line 17, nesting depth: 1, iterations: 64 .loc 2 30 0 ld.global.f32 %f1, [%r24+0]; // id:50 .loc 2 31 0 ld.local.s16 %r25, [%r18+16]; // id:51 __cuda___cuda_params_16144+0x0 mul.lo.s32 %r26, %r25, %r14; // mul.lo.u32 %r27, %r26, 4; // add.u32 %r28, %r23, %r27; // ld.global.f32 %f2, [%r28+0]; // id:52 st.global.f32 [%r24+0], %f2; // id:53 .loc 2 32 0 st.global.f32 [%r28+0], %f1; // id:54 add.u32 %r18, %r18, 2; // add.s32 %r24, %r24, %r17; // setp.ne.u32 %p3, %r18, %r19; // @%p3 bra $Lt_0_3074; // $Lt_0_2050: .loc 2 35 0 exit; // $LDWend__Z9myslaswp_15slaswp_params_t: } // _Z9myslaswp_15slaswp_params_t architecture {sm_13} abiversion {1} modname {cubin} code { name = _Z9myslaswp_15slaswp_params_t lmem = 144 smem = 160 reg = 8 bar = 0 const { segname = const segnum = 1 offset = 0 bytes = 4 mem { 0x00000080 } } bincode { 0x10108005 0x00000003 0x00000205 0xc0000780 0x0000f809 0xc0000780 0x10248009 0x00000003 0x16000205 0x4400c780 0x203f8409 0x0fffffff 0xd8000005 0x60c00780 0x307c05fd 0x6c0107c8 0xd8000809 0x20000780 0x10004003 0x00000280 0xa0004205 0x04200780 0xa0004c09 0x04200780 0x40020205 0x00018780 0xa0000001 0x04000780 0x20000201 0x04000780 0xd0000805 0x40c00780 0x300003fd 0x6400c7c8 0x30000003 0x00000280 0xd0001009 0x40c00780 0xd000180d 0x40c00780 0xd0000005 0x40c00780 0x40050c11 0x00000780 0x60040e11 0x00010780 0x30100811 0xc4100780 0x60040c0d 0x00010780 0x20000601 0x04000780 0x30020001 0xc4100780 0x20000205 0x04000780 0x1000f811 0x0603c780 0x3002040d 0xc4100780 0x10000201 0x0403c780 0x00000805 0xc0000780 0xd4002015 0x40600780 0x40051419 0x00000780 0x60041619 0x00018780 0x30100c19 0xc4100780 0x60041415 0x00018780 0x30020a15 0xc4100780 0x2000001d 0x04014780 0xd00e0219 0x80c00780 0xd00e0e15 0x80c00780 0x20028811 0x00000003 0xd00e0215 0xa0c00780 0x308009fd 0x644147c8 0xd00e0e19 0xa0c00780 0x20000205 0x0400c780 0x1001f003 0x00000280 0xf0000001 0xe0000001 } } zP|  AB < AB X@AB t`AB zAB AB  KAB p2AB AB zAB 8PKAB TAB HtAB IGCC: (GNU) 4.1.2 20080704 (Red Hat 4.1.2-44).symtab.strtab.shstrtab.rel.text.rel.data.bss.rel.ctors.rodata.str1.1.rodata.cst8.rel.nvFatBinSegment.rodata.rel.eh_frame.comment.note.GNU-stack@g @*) % + /84 +?2NN _@< [ ,0 p |Xx 0,p  .!!$ ')z<z<   .H H`x   ]"av "{@"`"PK K&8Cp2"I N"U]o v tmpxft_00006845_00000000-11_spermute.ii_GLOBAL__I_slaswp2__sti____cudaRegisterAll_43_tmpxft_00006845_00000000_4_spermute_cpp1_ii_4cd28a4f__fatDeviceText__cudaUnregisterBinaryUtil__cudaFatCubinHandle_ZZ43__device_stub__Z9myslaswp_15slaswp_params_tE3__f__ptxEntries__cubinEntries__debugEntries__deviceText_$compute_13$__deviceText_$sm_13$min__gxx_personality_v0uminmaxumax__cudaRegisterFatBinaryatexit_Z16myslaswp___entry15slaswp_params_t__cudaRegisterFunction__cudaUnregisterFatBinary__device_stub__Z9myslaswp_15slaswp_params_tcudaSetupArgumentcudaLaunchrsqrtsqrtrsqrtfslaswp2cudaConfigureCallmemcpymagmablas_spermute_longmemset !I#UY`e$&  %,4<D y#$#)K*l#w}$,)*@#OUY^$  $@\x <Xx/25 1250207509 500 500 100644 16624 ` ELF44(UUE 9~]Ð&'UUE 9v]Ð&'UUE 9}]Ð&'UUE 9s]Ð&'U($$PD$$D$ D$$D$D$D$D$ D$D$D$$D$ D$D$D$D$D$ D$D$$ÐU$Ðv'U(EED$D$$tÍE D$D$$uߍED$D$$u$ÐU(EED$D$$tÍE D$D$$uߍED$D$$u$ÐUEzt$==Í&'UE$]EÐU(EEE EEEEED$D$$tÍED$D$$uߍED$D$$u$Ðt&U(EEE EEEEED$D$$tÍED$D$$uߍED$D$$u$Ðt&U($$PD$$D$ D$$D$D$D$D$ D$D$D$$D$ D$D$D$D$D$ D$D$$ÐUhUuu}} ]EEEtmXỦ]ED$D$D$ D$D$$\$D$]u}]ÍvBẺ]ED$D$D$ D$D$$\$D$uEEu}]D$D$$aED$D$$>ED$D$$$EEu}]D$D$$ED$D$$ED$D$$$jXc 0_Z14dinplace_T_oddPdii_Z15dinplace_T_evenPdiif799d168686a7c1edinplace_transpose.cu compute_13sm_13?Z h̎/@V v .version 1.4 .target sm_13 // compiled with /usr/local/cuda-2.2/open64/lib//be // nvopencc built on 2009-04-09 .reg .u32 %ra<17>; .reg .u64 %rda<17>; .reg .f32 %fa<17>; .reg .f64 %fda<17>; .reg .u32 %rv<5>; .reg .u64 %rdv<5>; .reg .f32 %fv<5>; .reg .f64 %fdv<5>; //----------------------------------------------------------- // Compiling /tmp/tmpxft_00006871_00000000-7_dinplace_transpose.cpp3.i (/tmp/ccBI#.zZf5JF) //----------------------------------------------------------- //----------------------------------------------------------- // Options: //----------------------------------------------------------- // Target:ptx, ISA:sm_13, Endian:little, Pointer Size:32 // -O3 (Optimization level) // -g0 (Debug level) // -m2 (Report advisories) //----------------------------------------------------------- .file 1 "/tmp/tmpxft_00006871_00000000-6_dinplace_transpose.cudafe2.gpu" .file 2 "/usr/lib/gcc/x86_64-redhat-linux/4.1.2/include/stddef.h" .file 3 "/usr/local/cuda/bin//../include/crt/device_runtime.h" .file 4 "/usr/local/cuda/bin//../include/host_defines.h" .file 5 "/usr/local/cuda/bin//../include/builtin_types.h" .file 6 "/usr/local/cuda/bin//../include/device_types.h" .file 7 "/usr/local/cuda/bin//../include/driver_types.h" .file 8 "/usr/local/cuda/bin//../include/texture_types.h" .file 9 "/usr/local/cuda/bin//../include/vector_types.h" .file 10 "/usr/local/cuda/bin//../include/device_launch_parameters.h" .file 11 "/usr/local/cuda/bin//../include/crt/storage_class.h" .file 12 "/usr/include/bits/types.h" .file 13 "/usr/include/time.h" .file 14 "dinplace_transpose.cu" .file 15 "/usr/local/cuda/bin//../include/common_functions.h" .file 16 "/usr/local/cuda/bin//../include/crt/func_macro.h" .file 17 "/usr/local/cuda/bin//../include/math_functions.h" .file 18 "/usr/local/cuda/bin//../include/device_functions.h" .file 19 "/usr/local/cuda/bin//../include/math_constants.h" .file 20 "/usr/local/cuda/bin//../include/sm_11_atomic_functions.h" .file 21 "/usr/local/cuda/bin//../include/sm_12_atomic_functions.h" .file 22 "/usr/local/cuda/bin//../include/sm_13_double_functions.h" .file 23 "/usr/local/cuda/bin//../include/texture_fetch_functions.h" .file 24 "/usr/local/cuda/bin//../include/math_functions_dbl_ptx3.h" .entry _Z15dinplace_T_evenPdii ( .param .u32 __cudaparm__Z15dinplace_T_evenPdii_matrix, .param .s32 __cudaparm__Z15dinplace_T_evenPdii_lda, .param .s32 __cudaparm__Z15dinplace_T_evenPdii_half) { .reg .u32 %r<40>; .reg .f64 %fd<7>; .reg .pred %p<4>; .shared .align 8 .b8 __cuda_a16[2176]; .shared .align 8 .b8 __cuda_b2192[2176]; .loc 14 9 0 $LBB1__Z15dinplace_T_evenPdii: mov.u32 %r1, __cuda_a16; // .loc 14 27 0 cvt.u32.u16 %r2, %ctaid.y; // cvt.u32.u16 %r3, %ctaid.x; // setp.lt.u32 %p1, %r2, %r3; // ld.param.u32 %r4, [__cudaparm__Z15dinplace_T_evenPdii_half]; // id:61 __cudaparm__Z15dinplace_T_evenPdii_half+0x0 add.u32 %r5, %r4, %r3; // sub.s32 %r6, %r3, 1; // add.u32 %r7, %r4, %r2; // cvt.s32.u16 %r8, %tid.y; // mul24.lo.u32 %r9, %r8, 17; // selp.s32 %r10, %r2, %r5, %p1; // selp.s32 %r11, %r6, %r7, %p1; // cvt.s32.u16 %r12, %tid.x; // add.u32 %r13, %r12, %r9; // mul.lo.s32 %r14, %r10, 16; // mul.lo.s32 %r15, %r11, 16; // mul.lo.u32 %r16, %r13, 8; // add.s32 %r17, %r14, %r8; // add.u32 %r18, %r15, %r12; // ld.param.s32 %r19, [__cudaparm__Z15dinplace_T_evenPdii_lda]; // id:58 __cudaparm__Z15dinplace_T_evenPdii_lda+0x0 mul24.lo.s32 %r20, %r17, %r19; // add.u32 %r21, %r20, %r18; // mul.lo.u32 %r22, %r21, 8; // ld.param.u32 %r23, [__cudaparm__Z15dinplace_T_evenPdii_matrix]; // id:57 __cudaparm__Z15dinplace_T_evenPdii_matrix+0x0 add.u32 %r24, %r22, %r23; // ld.global.f64 %fd1, [%r24+0]; // id:64 add.u32 %r25, %r16, %r1; // st.shared.f64 [%r25+0], %fd1; // id:65 __cuda_a16+0x0 mul24.lo.u32 %r26, %r12, 17; // add.u32 %r27, %r8, %r26; // mul.lo.u32 %r28, %r27, 8; // add.u32 %r29, %r28, %r1; // setp.ne.s32 %p2, %r14, %r15; // @%p2 bra $Lt_0_2818; // .loc 14 32 0 bar.sync 0; // .loc 14 33 0 ld.shared.f64 %fd2, [%r29+0]; // id:66 __cuda_a16+0x0 st.global.f64 [%r24+0], %fd2; // id:67 bra.uni $Lt_0_2562; // $Lt_0_2818: mov.u32 %r30, __cuda_b2192; // .loc 14 40 0 add.u32 %r31, %r14, %r12; // add.s32 %r32, %r15, %r8; // mul24.lo.s32 %r33, %r32, %r19; // add.u32 %r34, %r33, %r31; // mul.lo.u32 %r35, %r34, 8; // add.u32 %r36, %r35, %r23; // ld.global.f64 %fd3, [%r36+0]; // id:68 add.u32 %r37, %r16, %r30; // st.shared.f64 [%r37+0], %fd3; // id:69 __cuda_b2192+0x0 .loc 14 42 0 bar.sync 0; // .loc 14 43 0 add.u32 %r38, %r28, %r30; // ld.shared.f64 %fd4, [%r38+0]; // id:70 __cuda_b2192+0x0 st.global.f64 [%r24+0], %fd4; // id:71 .loc 14 45 0 ld.shared.f64 %fd5, [%r29+0]; // id:72 __cuda_a16+0x0 st.global.f64 [%r36+0], %fd5; // id:73 $Lt_0_2562: .loc 14 48 0 exit; // $LDWend__Z15dinplace_T_evenPdii: } // _Z15dinplace_T_evenPdii .entry _Z14dinplace_T_oddPdii ( .param .u32 __cudaparm__Z14dinplace_T_oddPdii_matrix, .param .s32 __cudaparm__Z14dinplace_T_oddPdii_lda, .param .s32 __cudaparm__Z14dinplace_T_oddPdii_half) { .reg .u32 %r<40>; .reg .f64 %fd<7>; .reg .pred %p<4>; .shared .align 8 .b8 __cuda_a4384[2176]; .shared .align 8 .b8 __cuda_b6560[2176]; .loc 14 50 0 $LBB1__Z14dinplace_T_oddPdii: mov.u32 %r1, __cuda_a4384; // .loc 14 68 0 cvt.u32.u16 %r2, %ctaid.y; // cvt.u32.u16 %r3, %ctaid.x; // setp.le.u32 %p1, %r2, %r3; // ld.param.u32 %r4, [__cudaparm__Z14dinplace_T_oddPdii_half]; // id:61 __cudaparm__Z14dinplace_T_oddPdii_half+0x0 add.u32 %r5, %r4, %r3; // add.u32 %r6, %r4, %r2; // cvt.s32.u16 %r7, %tid.y; // mul24.lo.u32 %r8, %r7, 17; // selp.s32 %r9, %r2, %r5, %p1; // sub.s32 %r10, %r6, 1; // cvt.s32.u16 %r11, %tid.x; // add.u32 %r12, %r11, %r8; // mul.lo.s32 %r13, %r9, 16; // selp.s32 %r14, %r3, %r10, %p1; // mul.lo.u32 %r15, %r12, 8; // add.s32 %r16, %r13, %r7; // mul.lo.s32 %r17, %r14, 16; // ld.param.s32 %r18, [__cudaparm__Z14dinplace_T_oddPdii_lda]; // id:58 __cudaparm__Z14dinplace_T_oddPdii_lda+0x0 mul24.lo.s32 %r19, %r16, %r18; // add.u32 %r20, %r17, %r11; // add.u32 %r21, %r19, %r20; // mul.lo.u32 %r22, %r21, 8; // ld.param.u32 %r23, [__cudaparm__Z14dinplace_T_oddPdii_matrix]; // id:57 __cudaparm__Z14dinplace_T_oddPdii_matrix+0x0 add.u32 %r24, %r22, %r23; // ld.global.f64 %fd1, [%r24+0]; // id:64 add.u32 %r25, %r15, %r1; // st.shared.f64 [%r25+0], %fd1; // id:65 __cuda_a4384+0x0 mul24.lo.u32 %r26, %r11, 17; // add.u32 %r27, %r7, %r26; // mul.lo.u32 %r28, %r27, 8; // add.u32 %r29, %r28, %r1; // setp.ne.s32 %p2, %r13, %r17; // @%p2 bra $Lt_1_2818; // .loc 14 73 0 bar.sync 0; // .loc 14 74 0 ld.shared.f64 %fd2, [%r29+0]; // id:66 __cuda_a4384+0x0 st.global.f64 [%r24+0], %fd2; // id:67 bra.uni $Lt_1_2562; // $Lt_1_2818: mov.u32 %r30, __cuda_b6560; // .loc 14 81 0 add.u32 %r31, %r13, %r11; // add.s32 %r32, %r17, %r7; // mul24.lo.s32 %r33, %r32, %r18; // add.u32 %r34, %r33, %r31; // mul.lo.u32 %r35, %r34, 8; // add.u32 %r36, %r35, %r23; // ld.global.f64 %fd3, [%r36+0]; // id:68 add.u32 %r37, %r15, %r30; // st.shared.f64 [%r37+0], %fd3; // id:69 __cuda_b6560+0x0 .loc 14 83 0 bar.sync 0; // .loc 14 84 0 add.u32 %r38, %r28, %r30; // ld.shared.f64 %fd4, [%r38+0]; // id:70 __cuda_b6560+0x0 st.global.f64 [%r24+0], %fd4; // id:71 .loc 14 86 0 ld.shared.f64 %fd5, [%r29+0]; // id:72 __cuda_a4384+0x0 st.global.f64 [%r36+0], %fd5; // id:73 $Lt_1_2562: .loc 14 89 0 exit; // $LDWend__Z14dinplace_T_oddPdii: } // _Z14dinplace_T_oddPdii architecture {sm_13} abiversion {1} modname {cubin} code { name = _Z15dinplace_T_evenPdii lmem = 0 smem = 4384 reg = 7 bar = 1 const { segname = const segnum = 1 offset = 0 bytes = 8 mem { 0x000003ff 0x00000011 } } bincode { 0xa0004e05 0x04200780 0xa0004c09 0x04200780 0x300203fd 0x640047c8 0x2000cc0d 0x04208780 0x203f8409 0x0fffffff 0xd0800205 0x00400780 0x1000020d 0x0403c280 0x2000cc09 0x04204500 0xa0000205 0x04000780 0x30040615 0xc4100780 0xa0000001 0x04000780 0x30040419 0xc4100780 0x20058208 0x2006800c 0x4142ea08 0x20038408 0x30030411 0xc4100780 0x6081000d 0x60404780 0x60810209 0x60400780 0x2000c811 0x04210780 0x00030605 0xc0000780 0x00030409 0xc0000780 0xd00e0809 0x80800780 0x30060bfd 0x6c0147c8 0x08001001 0xe4208780 0x08001201 0xe420c780 0x1001d003 0x00000280 0x861ffe03 0x00000000 0x1500f000 0x1500f204 0xd00e0801 0xa0800780 0x30000003 0x00000780 0x20018c04 0x20008a00 0x4141ea04 0x20008200 0x30030001 0xc4100780 0x2000c809 0x04200780 0xd00e0401 0x80800780 0x08045001 0xe4200780 0x08045201 0xe4204780 0x861ffe03 0x00000000 0x14045001 0x4400c780 0x14045205 0x4400c780 0xd00e0801 0xa0800780 0x1500f000 0x1500f204 0xd00e0401 0xa0800781 } } code { name = _Z14dinplace_T_oddPdii lmem = 0 smem = 4384 reg = 7 bar = 1 const { segname = const segnum = 1 offset = 0 bytes = 8 mem { 0x000003ff 0x00000011 } } bincode { 0x10000005 0x0403c780 0xa0004e09 0x04200780 0xa0004c15 0x04200780 0x300505fd 0x6400c7c8 0x2105ec0c 0x2102ec10 0xd0800601 0x00400780 0x1000040d 0x0403c280 0x203f8809 0x0fffffff 0xa0000001 0x04000780 0x30040611 0xc4100780 0x10000a09 0x0403c280 0xa0000405 0x04000780 0x2000000d 0x04010780 0x30040419 0xc4100780 0x4143ea08 0x2006820c 0x20000409 0x0400c780 0x30030415 0xc4100780 0x6081020d 0x60400780 0x60810009 0x60404780 0x2000c815 0x04214780 0x00030605 0xc0000780 0x00030409 0xc0000780 0xd00e0a09 0x80800780 0x300609fd 0x6c0147c8 0x08001001 0xe4208780 0x08001201 0xe420c780 0x1001f003 0x00000280 0x861ffe03 0x00000000 0x1500f000 0x1500f204 0xd00e0a01 0xa0800780 0x30000003 0x00000780 0x20008c08 0x20018800 0x4142ea04 0x20008200 0x30030001 0xc4100780 0x2000c809 0x04200780 0xd00e0401 0x80800780 0x08045001 0xe4200780 0x08045201 0xe4204780 0x861ffe03 0x00000000 0x14045001 0x4400c780 0x14045205 0x4400c780 0xd00e0a01 0xa0800780 0x1500f000 0x1500f204 0xd00e0401 0xa0800781 } } zP|  AB < AB X@AB t`AB AB PAB pAB AB 2AB AB 8AB TAB p0AB  6AB LFFGCC: (GNU) 4.1.2 20080704 (Red Hat 4.1.2-44).symtab.strtab.shstrtab.rel.text.rel.data.bss.rel.ctors.rodata.str1.1.rodata.cst8.rel.nvFatBinSegment.rodata.rel.eh_frame.comment.note.GNU-stack@6 =)x % @ /84 @@?2iN_ < [ H@0 p`) |(2x x@x 3.447 : 30[< P%  Ubq      " "@"`"*Icp2" "6tmpxft_00006871_00000000-11_dinplace_transpose.ii_GLOBAL__I_magmablas_dinplace_transpose__sti____cudaRegisterAll_53_tmpxft_00006871_00000000_4_dinplace_transpose_cpp1_ii_98f6f52f__fatDeviceText__cudaUnregisterBinaryUtil__cudaFatCubinHandle_ZZ36__device_stub__Z14dinplace_T_oddPdiiE3__f_ZZ37__device_stub__Z15dinplace_T_evenPdiiE3__f__ptxEntries__cubinEntries__debugEntries__deviceText_$compute_13$__deviceText_$sm_13$min__gxx_personality_v0uminmaxumax__cudaRegisterFatBinaryatexit_Z21dinplace_T_odd__entryPdii__cudaRegisterFunction_Z22dinplace_T_even__entryPdii__cudaUnregisterFatBinary__device_stub__Z14dinplace_T_oddPdiicudaSetupArgumentcudaLaunch__device_stub__Z15dinplace_T_evenPdiirsqrtsqrtrsqrtfmagmablas_dinplace_transposecudaConfigureCall !19A"I!W_#%%%  &)%J%i%sw"~"&)(+%L%k%uy""&%% %  %&9>EJOT !"!,,3%V%y%""&%%% $( -&  $@\x <Xtdtranspose.cu_o/1250207510 500 500 100644 11436 ` ELF"4(UUE 9~]Ð&'UUE 9v]Ð&'UUE 9}]Ð&'UUE 9s]Ð&'U($$D$$D$ D$D$D$D$D$ D$D$$ÍU$Ðv'U(EED$D$$tÍE D$D$$uߍED$D$$uED$ D$$u$Ð&U]UEzt$==Í&'UE$]EÐUXEM]]uuEЋE}} E E‰EU܉EED$D$D$ D$D$$D$D$t]u}]ÍvEЉ]]uu} }E]U($$D$$D$ D$D$D$D$D$ D$D$$9D _Z13dtranspose_32PdiS_i46586cc1d29112e4dtranspose.cu compute_13sm_13?Z h̎)71) .version 1.4 .target sm_13 // compiled with /usr/local/cuda-2.2/open64/lib//be // nvopencc built on 2009-04-09 .reg .u32 %ra<17>; .reg .u64 %rda<17>; .reg .f32 %fa<17>; .reg .f64 %fda<17>; .reg .u32 %rv<5>; .reg .u64 %rdv<5>; .reg .f32 %fv<5>; .reg .f64 %fdv<5>; //----------------------------------------------------------- // Compiling /tmp/tmpxft_0000689d_00000000-7_dtranspose.cpp3.i (/tmp/ccBI#.RS8IYF) //----------------------------------------------------------- //----------------------------------------------------------- // Options: //----------------------------------------------------------- // Target:ptx, ISA:sm_13, Endian:little, Pointer Size:32 // -O3 (Optimization level) // -g0 (Debug level) // -m2 (Report advisories) //----------------------------------------------------------- .file 1 "/tmp/tmpxft_0000689d_00000000-6_dtranspose.cudafe2.gpu" .file 2 "/usr/lib/gcc/x86_64-redhat-linux/4.1.2/include/stddef.h" .file 3 "/usr/local/cuda/bin//../include/crt/device_runtime.h" .file 4 "/usr/local/cuda/bin//../include/host_defines.h" .file 5 "/usr/local/cuda/bin//../include/builtin_types.h" .file 6 "/usr/local/cuda/bin//../include/device_types.h" .file 7 "/usr/local/cuda/bin//../include/driver_types.h" .file 8 "/usr/local/cuda/bin//../include/texture_types.h" .file 9 "/usr/local/cuda/bin//../include/vector_types.h" .file 10 "/usr/local/cuda/bin//../include/device_launch_parameters.h" .file 11 "/usr/local/cuda/bin//../include/crt/storage_class.h" .file 12 "/usr/include/bits/types.h" .file 13 "/usr/include/time.h" .file 14 "dtranspose.cu" .file 15 "/usr/local/cuda/bin//../include/common_functions.h" .file 16 "/usr/local/cuda/bin//../include/crt/func_macro.h" .file 17 "/usr/local/cuda/bin//../include/math_functions.h" .file 18 "/usr/local/cuda/bin//../include/device_functions.h" .file 19 "/usr/local/cuda/bin//../include/math_constants.h" .file 20 "/usr/local/cuda/bin//../include/sm_11_atomic_functions.h" .file 21 "/usr/local/cuda/bin//../include/sm_12_atomic_functions.h" .file 22 "/usr/local/cuda/bin//../include/sm_13_double_functions.h" .file 23 "/usr/local/cuda/bin//../include/texture_fetch_functions.h" .file 24 "/usr/local/cuda/bin//../include/math_functions_dbl_ptx3.h" .entry _Z13dtranspose_32PdiS_i ( .param .u32 __cudaparm__Z13dtranspose_32PdiS_i___val_paramB, .param .s32 __cudaparm__Z13dtranspose_32PdiS_i_ldb, .param .u32 __cudaparm__Z13dtranspose_32PdiS_i___val_paramA, .param .s32 __cudaparm__Z13dtranspose_32PdiS_i_lda) { .reg .u16 %rh<4>; .reg .u32 %r<43>; .reg .f64 %fd<10>; .shared .align 8 .b8 __cuda_a16[8448]; .loc 14 9 0 $LBB1__Z13dtranspose_32PdiS_i: mov.u32 %r1, __cuda_a16; // .loc 14 21 0 mov.u16 %rh1, %ctaid.x; // mul.wide.u16 %r2, %rh1, 32; // mov.u16 %rh2, %ctaid.y; // mul.wide.u16 %r3, %rh2, 32; // cvt.s32.u16 %r4, %tid.y; // mul24.lo.u32 %r5, %r4, 33; // cvt.s32.u16 %r6, %tid.x; // add.s32 %r7, %r6, %r2; // add.s32 %r8, %r4, %r3; // add.u32 %r9, %r6, %r5; // ld.param.s32 %r10, [__cudaparm__Z13dtranspose_32PdiS_i_lda]; // id:68 __cudaparm__Z13dtranspose_32PdiS_i_lda+0x0 mul24.lo.s32 %r11, %r8, %r10; // mul.lo.u32 %r12, %r9, 8; // add.s32 %r13, %r7, %r11; // add.u32 %r14, %r12, %r1; // mul.lo.u32 %r15, %r13, 8; // ld.param.u32 %r16, [__cudaparm__Z13dtranspose_32PdiS_i___val_paramA]; // id:73 __cudaparm__Z13dtranspose_32PdiS_i___val_paramA+0x0 add.u32 %r17, %r16, %r15; // ld.global.f64 %fd1, [%r17+0]; // id:74 st.shared.f64 [%r14+0], %fd1; // id:75 __cuda_a16+0x0 .loc 14 22 0 mul.lo.u32 %r18, %r10, 64; // add.u32 %r19, %r17, %r18; // ld.global.f64 %fd2, [%r19+0]; // id:76 st.shared.f64 [%r14+2112], %fd2; // id:77 __cuda_a16+0x0 .loc 14 23 0 mul.lo.u32 %r20, %r10, 128; // add.u32 %r21, %r17, %r20; // ld.global.f64 %fd3, [%r21+0]; // id:78 st.shared.f64 [%r14+4224], %fd3; // id:79 __cuda_a16+0x0 .loc 14 24 0 mul.lo.u32 %r22, %r10, 192; // add.u32 %r23, %r17, %r22; // ld.global.f64 %fd4, [%r23+0]; // id:80 st.shared.f64 [%r14+6336], %fd4; // id:81 __cuda_a16+0x0 .loc 14 26 0 bar.sync 0; // .loc 14 28 0 mul24.lo.u32 %r24, %r6, 33; // add.s32 %r25, %r4, %r2; // add.s32 %r26, %r6, %r3; // add.u32 %r27, %r4, %r24; // ld.param.s32 %r28, [__cudaparm__Z13dtranspose_32PdiS_i_ldb]; // id:67 __cudaparm__Z13dtranspose_32PdiS_i_ldb+0x0 mul24.lo.s32 %r29, %r25, %r28; // mul.lo.u32 %r30, %r27, 8; // add.s32 %r31, %r26, %r29; // add.u32 %r32, %r30, %r1; // mul.lo.u32 %r33, %r31, 8; // ld.param.u32 %r34, [__cudaparm__Z13dtranspose_32PdiS_i___val_paramB]; // id:82 __cudaparm__Z13dtranspose_32PdiS_i___val_paramB+0x0 add.u32 %r35, %r34, %r33; // ld.shared.f64 %fd5, [%r32+0]; // id:83 __cuda_a16+0x0 st.global.f64 [%r35+0], %fd5; // id:84 .loc 14 29 0 ld.shared.f64 %fd6, [%r32+64]; // id:85 __cuda_a16+0x0 mul.lo.u32 %r36, %r28, 64; // add.u32 %r37, %r35, %r36; // st.global.f64 [%r37+0], %fd6; // id:86 .loc 14 30 0 ld.shared.f64 %fd7, [%r32+128]; // id:87 __cuda_a16+0x0 mul.lo.u32 %r38, %r28, 128; // add.u32 %r39, %r35, %r38; // st.global.f64 [%r39+0], %fd7; // id:88 .loc 14 31 0 ld.shared.f64 %fd8, [%r32+192]; // id:89 __cuda_a16+0x0 mul.lo.u32 %r40, %r28, 192; // add.u32 %r41, %r35, %r40; // st.global.f64 [%r41+0], %fd8; // id:90 .loc 14 32 0 exit; // $LDWend__Z13dtranspose_32PdiS_i: } // _Z13dtranspose_32PdiS_i architecture {sm_13} abiversion {1} modname {cubin} code { name = _Z13dtranspose_32PdiS_i lmem = 0 smem = 8480 reg = 12 bar = 1 const { segname = const segnum = 1 offset = 0 bytes = 8 mem { 0x000003ff 0x00000021 } } bincode { 0xd0800205 0x00400780 0x41202c29 0x00000003 0x41202e2d 0x00000003 0xa0000005 0x04000780 0xa0000201 0x04000780 0x2001940c 0x20009608 0x6002ce09 0x8020c780 0x3003040d 0xc4100780 0x3006ce11 0xc4300780 0x3007ce15 0xc4300780 0x60810009 0x60404780 0x2103ec0c 0x2005881c 0x00030405 0xc0000780 0x20048610 0x20058618 0x20000621 0x0401c780 0xd00e0609 0x80800780 0xd00e0811 0x80800780 0xd00e0c19 0x80800780 0xd00e1021 0x80800780 0x04001001 0xe4208780 0x04001201 0xe420c780 0x04043001 0xe4210780 0x04043201 0xe4214780 0x04085001 0xe4218780 0x04085201 0xe421c780 0x040c7001 0xe4220780 0x040c7201 0xe4224780 0x861ffe03 0x00000000 0x200a8008 0x200b820c 0x60810201 0x60400780 0x6002ca05 0x8020c780 0x00030005 0xc0000780 0x30030209 0xc4100780 0x1500f000 0x1500f204 0x2000c809 0x04208780 0x3006ca0d 0xc4300780 0xd00e0401 0xa0800780 0x14003001 0x4400c780 0x14003205 0x4400c780 0x20000411 0x0400c780 0x3007ca15 0xc4300780 0xd00e0801 0xa0800780 0x20000611 0x04014780 0x14005001 0x4400c780 0x14005205 0x4400c780 0x2005840c 0x20048408 0xd00e0601 0xa0800780 0x14007001 0x4400c780 0x14007205 0x4400c780 0xd00e0401 0xa0800781 } } zP|  AB < AB X@AB t`AB zAB AB  AB 2AB  AB  @AB LFL@zAB GCC: (GNU) 4.1.2 20080704 (Red Hat 4.1.2-44).symtab.strtab.shstrtab.rel.text.rel.data.bss.rel.ctors.rodata.str1.1.rodata.cst8.rel.nvFatBinSegment.rodata.rel.eh_frame.comment.note.GNU-stack@ *() % + /84 ,?2JN@_`< [ ,0 p |h Xx L,` !.!!% P(z+zKz<   *9 S    h"l "@"`"  $6A2"G L "S@htmpxft_0000689d_00000000-11_dtranspose.ii_GLOBAL__I_magmablas_dtranspose__sti____cudaRegisterAll_45_tmpxft_0000689d_00000000_4_dtranspose_cpp1_ii_d4ddcac6__fatDeviceText__cudaUnregisterBinaryUtil__cudaFatCubinHandle_ZZ37__device_stub__Z13dtranspose_32PdiS_iE3__f__ptxEntries__cubinEntries__debugEntries__deviceText_$compute_13$__deviceText_$sm_13$min__gxx_personality_v0uminmaxumax__cudaRegisterFatBinaryatexit_Z20dtranspose_32__entryPdiS_i__cudaRegisterFunction__cudaUnregisterFatBinary__device_stub__Z13dtranspose_32PdiS_icudaSetupArgumentcudaLaunchrsqrtsqrtrsqrtfmagmablas_dtransposecudaConfigureCall !I#j###$&  -%)%*/4lt| " "  $@\x Ddpermute.cu_o/ 1250207511 500 500 100644 11424 ` ELF!4(UUE 9~]Ð&'UUE 9v]Ð&'UUE 9}]Ð&'UUE 9s]Ð&'U($$D$$D$ D$D$D$D$D$ D$D$$ÍU$Ðv'U(ED$D$E$t$Ðt&UEzt$==Í&'UE$]EÐU($$D$$D$ D$D$D$D$D$ D$D$$ÍU(EED$D$$t$Ðt&UVSuVEEE@?EEED$D$D$ @D$D$$D$D$t [^]ÍLD$t$$LED$D$$u$[^]ÍUWVS|EE HDž?‹EUD$1D$<$Ef)fD_U@uዕEEE@UEED$D$D$ @D$D$$D$D$t+@9U|[^_]ËU D$|$TEPL$HED$D$$h$M=H`_Z9mydlaswp_15dlaswp_params_t7ae718e2630785cddpermute.cu compute_13sm_13?Z h̎/;1 .version 1.4 .target sm_13 // compiled with /usr/local/cuda-2.2/open64/lib//be // nvopencc built on 2009-04-09 .reg .u32 %ra<17>; .reg .u64 %rda<17>; .reg .f32 %fa<17>; .reg .f64 %fda<17>; .reg .u32 %rv<5>; .reg .u64 %rdv<5>; .reg .f32 %fv<5>; .reg .f64 %fdv<5>; //----------------------------------------------------------- // Compiling /tmp/tmpxft_000068c9_00000000-7_dpermute.cpp3.i (/tmp/ccBI#.qeWeXC) //----------------------------------------------------------- //----------------------------------------------------------- // Options: //----------------------------------------------------------- // Target:ptx, ISA:sm_13, Endian:little, Pointer Size:32 // -O3 (Optimization level) // -g0 (Debug level) // -m2 (Report advisories) //----------------------------------------------------------- .file 1 "/tmp/tmpxft_000068c9_00000000-6_dpermute.cudafe2.gpu" .file 2 "dpermute.cu" .file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.1.2/include/stddef.h" .file 4 "/usr/local/cuda/bin//../include/crt/device_runtime.h" .file 5 "/usr/local/cuda/bin//../include/host_defines.h" .file 6 "/usr/local/cuda/bin//../include/builtin_types.h" .file 7 "/usr/local/cuda/bin//../include/device_types.h" .file 8 "/usr/local/cuda/bin//../include/driver_types.h" .file 9 "/usr/local/cuda/bin//../include/texture_types.h" .file 10 "/usr/local/cuda/bin//../include/vector_types.h" .file 11 "/usr/local/cuda/bin//../include/device_launch_parameters.h" .file 12 "/usr/local/cuda/bin//../include/crt/storage_class.h" .file 13 "/usr/include/bits/types.h" .file 14 "/usr/include/time.h" .file 15 "/usr/local/cuda/bin//../include/common_functions.h" .file 16 "/usr/local/cuda/bin//../include/crt/func_macro.h" .file 17 "/usr/local/cuda/bin//../include/math_functions.h" .file 18 "/usr/local/cuda/bin//../include/device_functions.h" .file 19 "/usr/local/cuda/bin//../include/math_constants.h" .file 20 "/usr/local/cuda/bin//../include/sm_11_atomic_functions.h" .file 21 "/usr/local/cuda/bin//../include/sm_12_atomic_functions.h" .file 22 "/usr/local/cuda/bin//../include/sm_13_double_functions.h" .file 23 "/usr/local/cuda/bin//../include/texture_fetch_functions.h" .file 24 "/usr/local/cuda/bin//../include/math_functions_dbl_ptx3.h" .entry _Z9mydlaswp_15dlaswp_params_t ( .param .align 4 .b8 __cudaparm__Z9mydlaswp_15dlaswp_params_t___val_paramparams[144]) { .reg .u32 %r<30>; .reg .f64 %fd<4>; .reg .pred %p<5>; .local .align 4 .b8 __cuda___cuda_params_16144[144]; .loc 2 17 0 $LBB1__Z9mydlaswp_15dlaswp_params_t: mov.u32 %r1, __cudaparm__Z9mydlaswp_15dlaswp_params_t___val_paramparams; // mov.u32 %r2, __cuda___cuda_params_16144; // mov.s32 %r3, %r1; // mov.s32 %r4, %r2; // mov.s32 %r5, 36; // $L_0_3586: // Loop body line 17, nesting depth: 1, iterations: 36 ld.param.u32 %r6, [%r3+0]; // id:56 __cudaparm__Z9mydlaswp_15dlaswp_params_t___val_paramparams+0x0 st.local.u32 [%r4+0], %r6; // id:55 __cuda___cuda_params_16144+0x0 add.u32 %r3, %r3, 4; // add.u32 %r4, %r4, 4; // sub.s32 %r5, %r5, 1; // mov.u32 %r7, 0; // setp.gt.s32 %p1, %r5, %r7; // @%p1 bra $L_0_3586; // cvt.s32.u16 %r8, %ntid.x; // cvt.s32.u16 %r9, %ctaid.x; // mul24.lo.s32 %r10, %r8, %r9; // cvt.u32.u16 %r11, %tid.x; // add.u32 %r12, %r10, %r11; // ld.local.u32 %r13, [__cuda___cuda_params_16144+4]; // id:47 __cuda___cuda_params_16144+0x4 setp.le.u32 %p2, %r13, %r12; // @%p2 bra $Lt_0_2050; // ld.local.s32 %r14, [__cuda___cuda_params_16144+8]; // id:41 __cuda___cuda_params_16144+0x8 ld.local.s32 %r15, [__cuda___cuda_params_16144+12]; // id:48 __cuda___cuda_params_16144+0xc mul.lo.s32 %r16, %r15, %r14; // mul.lo.u32 %r17, %r14, 8; // mov.u32 %r18, __cuda___cuda_params_16144; // mov.u32 %r19, __cuda___cuda_params_16144+128; // add.u32 %r20, %r16, %r12; // mul.lo.u32 %r21, %r20, 8; // ld.local.u32 %r22, [__cuda___cuda_params_16144+0]; // id:49 __cuda___cuda_params_16144+0x0 add.s32 %r23, %r22, %r21; // mov.s32 %r24, %r23; // $Lt_0_3074: // Loop body line 17, nesting depth: 1, iterations: 64 .loc 2 30 0 ld.global.f64 %fd1, [%r24+0]; // id:50 .loc 2 31 0 ld.local.s16 %r25, [%r18+16]; // id:51 __cuda___cuda_params_16144+0x0 mul.lo.s32 %r26, %r25, %r14; // mul.lo.u32 %r27, %r26, 8; // add.u32 %r28, %r23, %r27; // ld.global.f64 %fd2, [%r28+0]; // id:52 st.global.f64 [%r24+0], %fd2; // id:53 .loc 2 32 0 st.global.f64 [%r28+0], %fd1; // id:54 add.u32 %r18, %r18, 2; // add.s32 %r24, %r24, %r17; // setp.ne.u32 %p3, %r18, %r19; // @%p3 bra $Lt_0_3074; // $Lt_0_2050: .loc 2 35 0 exit; // $LDWend__Z9mydlaswp_15dlaswp_params_t: } // _Z9mydlaswp_15dlaswp_params_t architecture {sm_13} abiversion {1} modname {cubin} code { name = _Z9mydlaswp_15dlaswp_params_t lmem = 144 smem = 160 reg = 10 bar = 0 const { segname = const segnum = 1 offset = 0 bytes = 4 mem { 0x00000080 } } bincode { 0x10108005 0x00000003 0x00000205 0xc0000780 0x0000f809 0xc0000780 0x10248009 0x00000003 0x16000205 0x4400c780 0x203f8409 0x0fffffff 0xd8000005 0x60c00780 0x307c05fd 0x6c0107c8 0xd8000809 0x20000780 0x10004003 0x00000280 0xa0004205 0x04200780 0xa0004c09 0x04200780 0x40020205 0x00018780 0xa0000001 0x04000780 0x20000201 0x04000780 0xd0000805 0x40c00780 0x300003fd 0x6400c7c8 0x30000003 0x00000280 0xd0001009 0x40c00780 0xd000180d 0x40c00780 0xd0000005 0x40c00780 0x40050c11 0x00000780 0x60040e11 0x00010780 0x30100811 0xc4100780 0x60040c0d 0x00010780 0x20000601 0x04000780 0x30030001 0xc4100780 0x20000205 0x04000780 0x1000f821 0x0603c780 0x3003040d 0xc4100780 0x10000201 0x0403c780 0x00001005 0xc0000780 0xd4002011 0x40600780 0x40051015 0x00000780 0x60041215 0x00014780 0x30100a15 0xc4100780 0x60041011 0x00014780 0x30030811 0xc4100780 0x20000025 0x04010780 0xd00e0211 0x80800780 0xd00e1219 0x80800780 0x20029021 0x00000003 0xd00e0219 0xa0800780 0x308011fd 0x644147c8 0xd00e1211 0xa0800780 0x20000205 0x0400c780 0x1001f003 0x00000280 0xf0000001 0xe0000001 } } zP|  AB < AB X@AB t`AB zAB AB  KAB p2AB AB zAB 8PKAB TAB HtAB IGCC: (GNU) 4.1.2 20080704 (Red Hat 4.1.2-44).symtab.strtab.shstrtab.rel.text.rel.data.bss.rel.ctors.rodata.str1.1.rodata.cst8.rel.nvFatBinSegment.rodata.rel.eh_frame.comment.note.GNU-stack@g @*) % + /84 +?2NN _@< [ ,0 p |Xx 0,p  .!!$ ')z<z<   .H H`x   ]"av "{@"`"PK K&8Cp2"I N"U]o v tmpxft_000068c9_00000000-11_dpermute.ii_GLOBAL__I_dlaswp2__sti____cudaRegisterAll_43_tmpxft_000068c9_00000000_4_dpermute_cpp1_ii_ddc25250__fatDeviceText__cudaUnregisterBinaryUtil__cudaFatCubinHandle_ZZ43__device_stub__Z9mydlaswp_15dlaswp_params_tE3__f__ptxEntries__cubinEntries__debugEntries__deviceText_$compute_13$__deviceText_$sm_13$min__gxx_personality_v0uminmaxumax__cudaRegisterFatBinaryatexit_Z16mydlaswp___entry15dlaswp_params_t__cudaRegisterFunction__cudaUnregisterFatBinary__device_stub__Z9mydlaswp_15dlaswp_params_tcudaSetupArgumentcudaLaunchrsqrtsqrtrsqrtfdlaswp2cudaConfigureCallmemcpymagmablas_dpermute_longmemset !I#UY`e$&  %,4<D y#$#)K*l#w}$,)*@#OUY^$  $@\x <Xx