27 PLASMA_Complex64_t *
A;
29 PLASMA_Complex64_t *
T;
31 PLASMA_Complex64_t *
TAU;
32 PLASMA_Complex64_t *WORK;
40 A = (PLASMA_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]);
41 T = (PLASMA_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]);
44 WORK = TAU +
min( M, N );
46 CORE_zgeqrt(M, N, IB, A, LDA, T, LDT, TAU, WORK);
/*
 * Multicore variant of the zgeqrt codelet function.
 *
 * NOTE(review): the trailing `&& 0` makes this preprocessor condition always
 * false, so whatever this #if guards is never compiled, even when
 * MORSE_USE_MULTICORE is defined. The matching #endif is not visible in this
 * listing, so the exact extent of the disabled region should be confirmed.
 *
 * descr  - presumably the array of StarPU buffer interfaces handed to the
 *          codelet (TODO confirm against the codelet declaration).
 * cl_arg - presumably the packed scalar arguments of the task (TODO confirm).
 *
 * The function body is not part of this listing.
 */
52 #if defined(MORSE_USE_MULTICORE) && 0
53 static void cl_zgeqrt_mc_func(
void *descr[],
void *cl_arg)
/*
 * CUDA variant of the zgeqrt codelet function.
 *
 * NOTE(review): the trailing `&& 0` makes this preprocessor condition always
 * false, so whatever this #if guards is never compiled, even when
 * MORSE_USE_CUDA is defined. The matching #endif is not visible in this
 * listing.
 *
 * Only part of the body is visible here; the comments below describe the
 * visible statements only.
 */
61 #if defined(MORSE_USE_CUDA) && 0
62 static void cl_zgeqrt_cuda_func(
void *descr[],
void *cl_arg)
/* Working pointers. The h_ and d_ prefixes look like host and device copies
 * (naming only; confirm). h_D and d_D are used further down but their
 * declarations are not among the visible lines. */
67 cuDoubleComplex *h_A, *d_A;
69 cuDoubleComplex *h_T, *d_T;
72 cuDoubleComplex *h_TAU;
73 cuDoubleComplex *h_WORK, *d_WORK;
/* Tail of a call that receives the addresses of the scratch_* variables.
 * The callee name is on a line missing from this listing; presumably it
 * unpacks cl_arg (TODO confirm). */
87 &scratch_tau, &scratch_work,
88 &scratch_h_work, &scratch_h_a,
89 &scratch_h_T, &scratch_h_D, &scratch_d_D);
/* d_A and d_T are taken from the first two StarPU buffers via
 * STARPU_MATRIX_GET_PTR. */
92 d_A = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
93 d_T = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]);
/* Zero-fill the h_* buffers with the C library memset, so these pointers
 * must be addressable from the host. Element counts: h_A M*N, h_T IB*N,
 * h_D IB*M, h_TAU M, h_WORK 2*M*M.
 * NOTE(review): if M, N and IB are plain int (declarations not visible), the
 * products are evaluated in int before being widened by sizeof; confirm tile
 * sizes keep 2*M*M in range. */
103 memset(h_A, 0, M*N *
sizeof(cuDoubleComplex));
104 memset(h_T, 0, IB*N *
sizeof(cuDoubleComplex));
105 memset(h_D, 0, IB*M *
sizeof(cuDoubleComplex));
106 memset(h_TAU, 0, M *
sizeof(cuDoubleComplex));
107 memset(h_WORK, 0, 2*M*M*
sizeof(cuDoubleComplex));
/* Blocking device-to-host copy of M*IB elements from d_A into h_A.
 * NOTE(review): h_A was cleared for M*N elements above but only M*IB are
 * copied here; confirm that copying only that many elements is intended.
 * The cudaError_t result is discarded, so a failed copy goes unnoticed. */
110 cudaMemcpy(h_A, d_A, M*IB*
sizeof(cuDoubleComplex), cudaMemcpyDeviceToHost);
/* Blocking host-to-device copy of IB*M elements from h_D into d_D; the
 * result is discarded here as well. */
124 cudaMemcpy(d_D, h_D, IB*M*
sizeof(cuDoubleComplex), cudaMemcpyHostToDevice);
/* Opaque helper called with (M, IB, d_A, d_D); from the name it presumably
 * writes d_D back into the d_A tile (TODO confirm against its definition). */
125 splagma_zload_d_into_tile(M, IB, d_A, d_D);
/* NOTE(review): cudaThreadSynchronize() is deprecated in the CUDA runtime;
 * cudaDeviceSynchronize() is the documented replacement. Its return value,
 * which would report asynchronous errors from earlier work, is ignored. */
126 cudaThreadSynchronize();
140 int m,
int n,
int ib,
144 starpu_codelet *zgeqrt_codelet;
145 void (*callback)(
void*) = options->profiling ? cl_zgeqrt_callback : NULL;
146 int lda =
BLKLDD( A, Am );
147 int ldt =
BLKLDD( T, Tm );
151 #ifdef MORSE_USE_MULTICORE
152 zgeqrt_codelet = options->parallel ? &cl_zgeqrt_mc : &cl_zgeqrt;
154 zgeqrt_codelet = &cl_zgeqrt;
159 VALUE, &m,
sizeof(
int),
160 VALUE, &n,
sizeof(
int),
161 VALUE, &ib,
sizeof(
int),
163 VALUE, &lda,
sizeof(
int),
165 VALUE, &ldt,
sizeof(
int),