// Forum code listing: batched QR factorization of small matrices with MAGMA (magma_dgeqrf_batched).
/* Print a diagnostic for a failed CUDA runtime call; returns 1 on success, 0 on failure. */
static int cuda_ok(cudaError_t err, const char *what) {
    if (err != cudaSuccess) {
        fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
        return 0;
    }
    return 1;
}

/*
 * Batched QR factorization demo using magma_dgeqrf_batched.
 *
 * Builds `batchSize` column-major m-by-n matrices on the host (2.0 where the
 * row index equals the column index, 1.0 elsewhere), uploads them, factors all
 * of them in one batched call, then downloads and prints the factored
 * matrices and the tau scalars.
 *
 * Every pointer that is stored in the device pointer tables (d_A, d_tau) and
 * the info array handed to the batched routine are device allocations: the
 * routine dereferences them on the GPU.
 *
 * Returns 0 on success, 1 if any allocation, copy, or MAGMA call fails.
 */
int main() {
    /* All locals are declared before the first `goto cleanup` so that no
       jump bypasses an initialization (required when compiled as C++). */
    const magma_int_t m = 3;
    const magma_int_t n = 2;
    const magma_int_t batchSize = 2;            /* 2 small matrices */
    const magma_int_t lda = m;                  /* column-major leading dimension */
    const magma_int_t minmn = (m < n) ? m : n;  /* number of tau scalars per matrix */
    const size_t matBytes = (size_t)lda * (size_t)n * sizeof(double);
    const size_t tauBytes = (size_t)minmn * sizeof(double);
    const size_t ptrBytes = (size_t)batchSize * sizeof(double *);

    magma_queue_t queue = NULL;
    magma_int_t dev = 0;
    magma_int_t *info = NULL;     /* host copy of the per-matrix status codes */
    magma_int_t *d_info = NULL;   /* device array written by the batched routine */
    double **A = NULL;            /* host matrices */
    double **tau = NULL;          /* host copies of the tau scalars */
    double **h_d_A = NULL;        /* host table of device matrix pointers */
    double **h_d_tau = NULL;      /* host table of device tau pointers */
    double **d_A = NULL;          /* device copy of h_d_A */
    double **d_tau = NULL;        /* device copy of h_d_tau */
    int status = 1;

    if (magma_init() != MAGMA_SUCCESS) {
        fprintf(stderr, "magma_init failed\n");
        return 1;
    }
    magma_queue_create(dev, &queue);

    /* step 1: host storage. calloc leaves every slot NULL so the cleanup
       path can release partially built tables safely. */
    A       = (double **)calloc((size_t)batchSize, sizeof(double *));
    tau     = (double **)calloc((size_t)batchSize, sizeof(double *));
    h_d_A   = (double **)calloc((size_t)batchSize, sizeof(double *));
    h_d_tau = (double **)calloc((size_t)batchSize, sizeof(double *));
    if (A == NULL || tau == NULL || h_d_A == NULL || h_d_tau == NULL) {
        fprintf(stderr, "host allocation of pointer tables failed\n");
        goto cleanup;
    }
    if (magma_imalloc_cpu(&info, batchSize) != MAGMA_SUCCESS) {
        info = NULL;
        fprintf(stderr, "magma_imalloc_cpu failed\n");
        goto cleanup;
    }
    for (int i = 0; i < batchSize; i++) {
        A[i] = (double *)malloc(matBytes);
        tau[i] = (double *)malloc(tauBytes);
        if (A[i] == NULL || tau[i] == NULL) {
            fprintf(stderr, "host allocation for matrix %d failed\n", i);
            goto cleanup;
        }
    }

    /* step 2: per-matrix device buffers. tau must live on the device as well,
       because the pointers are consumed by GPU code. */
    for (int i = 0; i < batchSize; i++) {
        if (!cuda_ok(cudaMalloc((void **)&h_d_A[i], matBytes), "cudaMalloc(h_d_A[i])") ||
            !cuda_ok(cudaMalloc((void **)&h_d_tau[i], tauBytes), "cudaMalloc(h_d_tau[i])")) {
            goto cleanup;
        }
    }

    /* step 3: copy the host arrays of device pointers to the device */
    if (!cuda_ok(cudaMalloc((void **)&d_A, ptrBytes), "cudaMalloc(d_A)") ||
        !cuda_ok(cudaMalloc((void **)&d_tau, ptrBytes), "cudaMalloc(d_tau)") ||
        !cuda_ok(cudaMalloc((void **)&d_info, (size_t)batchSize * sizeof(magma_int_t)),
                 "cudaMalloc(d_info)") ||
        !cuda_ok(cudaMemcpy(d_A, h_d_A, ptrBytes, cudaMemcpyHostToDevice),
                 "cudaMemcpy(d_A)") ||
        !cuda_ok(cudaMemcpy(d_tau, h_d_tau, ptrBytes, cudaMemcpyHostToDevice),
                 "cudaMemcpy(d_tau)")) {
        goto cleanup;
    }

    /* fill the matrices in column-major order: element (i, j) is at j*lda + i */
    for (int k = 0; k < batchSize; k++) {
        for (int j = 0; j < n; j++) {
            for (int i = 0; i < m; i++) {
                A[k][j * lda + i] = (i == j) ? 2.0 : 1.0;
            }
        }
    }

    for (int i = 0; i < batchSize; i++) {
        magma_dsetmatrix(m, n, A[i], lda, h_d_A[i], lda, queue);
    }

    if (magma_dgeqrf_batched(m, n, d_A, lda, d_tau, d_info, batchSize, queue) != MAGMA_SUCCESS) {
        fprintf(stderr, "magma_dgeqrf_batched rejected its arguments\n");
        goto cleanup;
    }
    /* make sure the factorization has finished before reading results */
    magma_queue_sync(queue);

    for (int i = 0; i < batchSize; i++) {
        magma_dgetmatrix(m, n, h_d_A[i], lda, A[i], lda, queue);
        magma_dgetvector(minmn, h_d_tau[i], 1, tau[i], 1, queue);
    }
    if (!cuda_ok(cudaMemcpy(info, d_info, (size_t)batchSize * sizeof(magma_int_t),
                            cudaMemcpyDeviceToHost),
                 "cudaMemcpy(info)")) {
        goto cleanup;
    }
    for (int i = 0; i < batchSize; i++) {
        if (info[i] != 0) {
            fprintf(stderr, "matrix %d: factorization reported info = %lld\n",
                    i, (long long)info[i]);
        }
    }

    /* print the factored matrices, walking the column-major storage in order;
       every index stays below lda*n */
    for (int k = 0; k < batchSize; k++) {
        for (int j = 0; j < n; j++) {
            for (int i = 0; i < m; i++) {
                int index = j * lda + i;
                printf("\n %d The values are %lf", k + index, A[k][index]);
            }
        }
    }
    /* print the tau scalars of each factorization */
    for (int k = 0; k < batchSize; k++) {
        for (int j = 0; j < minmn; j++) {
            printf("\n matrix %d tau[%d] = %lf", k, j, tau[k][j]);
        }
    }
    printf("\n");
    status = 0;

cleanup:
    /* free(NULL) and cudaFree(NULL) are no-ops, so partially built tables are fine */
    if (h_d_A != NULL) {
        for (int i = 0; i < batchSize; i++) cudaFree(h_d_A[i]);
    }
    if (h_d_tau != NULL) {
        for (int i = 0; i < batchSize; i++) cudaFree(h_d_tau[i]);
    }
    if (A != NULL) {
        for (int i = 0; i < batchSize; i++) free(A[i]);
    }
    if (tau != NULL) {
        for (int i = 0; i < batchSize; i++) free(tau[i]);
    }
    cudaFree(d_A);
    cudaFree(d_tau);
    cudaFree(d_info);
    free(h_d_A);
    free(h_d_tau);
    free(A);
    free(tau);
    if (info != NULL) {
        magma_free_cpu(info);
    }
    if (queue != NULL) {
        magma_queue_destroy(queue);
    }
    magma_finalize();
    return status;
}