Problem with magmablas_dgemvt

bjsmith · Post by **bjsmith** » Wed Jul 07, 2010 12:35 pm

I gather that magmablas_dgemvt is supposed to compute y = A^t x, where A is n x m, x is n x 1, and y is m x 1. However, the function appears to compute only the first n elements of y. Thus, results are correct if m <= n, but incorrect otherwise. Below is a program, and its output, to illustrate the problem.

The main reason for bringing this up is because page 55 of the manual states that the MAGMA BLAS library includes "only certain kernels that are crucial for the performance of MAGMA routines." Hence, I am wondering if there are problems with other MAGMA routines that could be attributable to their calling dgemvt, such as the problem I am having with magma_dgeqrs/dgeqrf_gpu2.

On the other hand, the documentation for dgemvt is a bit unclear to me, so there is a chance that I am not using the function as intended.

Code: Select all

#include <stdio.h>
#include <stdlib.h>

#include <cuda.h>
#include <cublas.h>
#include <magma.h>

/***************************************************************************
 * Program to compute y = A^t x.
 * - magmablas_dgemvt fails
 * - cublasDgemv passes
 ***************************************************************************/

void mprint(int m, int n, double *x, int ldx) {
   int i, j;
   for(i = 0; i < m; i++) {
      for(j = 0; j < n; j++) {
         printf("%f ", x[i + j * ldx]);
      }
      printf("\n");
   }
}

#define n 2    // Number of rows in A
#define m 5    // Number of columns in A

int main(int argc, char** argv)
{
   int lda = n, i; 
   double A[n * m], x[n], y[m], zeros[m], *d_A, *d_x, *d_y;

   for(i = 0; i < (n * m); i++) A[i] = i;
   for(i = 0; i < n; i++) x[i] = 1.0;
   for(i = 0; i < m; i++) zeros[i] = 0.0;

   printf("Matrix A:\n");
   mprint(n, m, A, lda);

   printf("\nVector x:\n");
   mprint(n, 1.0, x, n);

   cublasInit();

   cublasAlloc(n * m, sizeof(double), (void **)&d_A);
   cublasAlloc(n, sizeof(double), (void **)&d_x);
   cublasAlloc(m, sizeof(double), (void **)&d_y);

   if(d_A == NULL || d_x == NULL || d_y == NULL) {
      fprintf(stderr, "Error: CUDA memory allocation failed\n");
      exit(1);
   }

  /***************************************************************************
   * MAGMA Matrix-Vector Multiplication
   * - Incorrectly returns only first m elements of the n-dimensional array z
   ***************************************************************************/

   printf("\nMAGMA Result\n=============\n");

   cublasSetVector(n * m, sizeof(double), A, 1, d_A, 1);
   cublasSetVector(n, sizeof(double), x, 1, d_x, 1);
   cublasSetVector(m, sizeof(double), zeros, 1, d_y, 1);

   magmablas_dgemvt(n, m, 1.0, d_A, lda, d_x, d_y);
   cublasGetVector(m, sizeof(double), d_y, 1, y, 1);

   printf("\nA^t x (magmablas_dgemvt):\n");
   mprint(m, 1.0, y, m);

  /***************************************************************************
   * CUBLAS Matrix-Vector Multiplication
   * - Returns correct result
   ***************************************************************************/

   printf("\nCUBLAS Result\n==============\n");

   cublasSetVector(m, sizeof(double), zeros, 1, d_y, 1);

   cublasDgemv('T', n, m, 1.0, d_A, lda, d_x, 1, 0.0, d_y, 1);
   cublasGetVector(m, sizeof(double), d_y, 1, y, 1);

   printf("\nA^t x (cublasDgemv):\n");
   mprint(m, 1.0, y, m);

   cublasFree(d_A);
   cublasFree(d_x);
   cublasFree(d_y);

   cublasShutdown();

   return(EXIT_SUCCESS);
}

Program output:

Matrix A:
0.000000 2.000000 4.000000 6.000000 8.000000
1.000000 3.000000 5.000000 7.000000 9.000000

Vector x:
1.000000
1.000000

MAGMA Result
=============

A^t x (magmablas_dgemvt):
1.000000
5.000000
0.000000
0.000000
0.000000

CUBLAS Result
==============

A^t x (cublasDgemv):
1.000000
5.000000
9.000000
13.000000
17.000000

Stan Tomov · Post by **Stan Tomov** » Thu Aug 05, 2010 3:33 pm

Hi,
Thanks for this bug report. The correct code is below.
A number of bugs have been found and fixed so far.
We are preparing our next release, that would include support
for Fermi GPUs as well.
Stan

Code: Select all

/*
    -- MAGMA (version 0.2) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       November 2009
*/

#include "cublas.h"
#include "magma.h"

#define num_threads 64
#define dgemv_bs 64

__global__ void 
dgemv_kernel(int n, int m, int n1, double* A, int lda, double *x, double *y)
{
  int ind = blockIdx.x*num_threads + threadIdx.x;

  A += ind;
  x += threadIdx.x;

  double res = 0.f;

  __shared__ double buff[dgemv_bs];
  for(int i=0; i<n1; i += dgemv_bs ){
    __syncthreads();
    buff[threadIdx.x]  = x[i];

    __syncthreads();
    #pragma unroll
    for(int j=0; j < dgemv_bs ; j++){
       res+=A[0]*buff[j];
       A+=lda;
    }
  }
  __syncthreads();

  if (m>n1){
     buff[threadIdx.x]  = x[n1];

     __syncthreads();
     for(int j=0; j<(m-n1); j++){
         res += A[0]*buff[j];
         A+=lda;
     }
  }

  if (ind<n)
     y[ind] = res;
}

extern "C" void
magmablas_dgemv(int n, int m, double *A, int lda, double *x, double *z)
{
/*  -- MAGMA (version 0.2) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       November 2009

    Purpose
    =======

    This routine computes z = A x on the GPU.

    N      - (input) INTEGER.
             On entry, N specifies the number of rows of the matrix A.

    M      - (input) INTEGER.
             On entry, M specifies the number of columns of the matrix A

    A      - (input) DOUBLE PRECISION array of dimension ( LDA, m ) on the GPU.
   
    LDA    - (input) INTEGER.
             LDA specifies the leading dimension of A.

    X      - (input) DOUBLE PRECISION array of dimension n.
     
    Z      - (output) DOUBLE PRECISION array of	dimension m. 
             On exit Z = A X.

    ===================================================================== */

    int blocks;
    if (n % num_threads==0)
        blocks = n/num_threads;
    else
        blocks = n/num_threads + 1;

    dim3 grid(blocks, 1, 1);
    dim3 threads(num_threads, 1, 1);
 
    dgemv_kernel<<<grid, threads>>>(n, m, (m / dgemv_bs)*dgemv_bs, 
                                    A, lda, x, z);
}

__global__ void
dgemvt_kernel1(int n, int m, double alpha, int n1, double* A, int lda,
              double *x, double *y)
{
  const int inx = threadIdx.x;
  const int iny = threadIdx.y;

  int ind  = iny + __mul24(blockIdx.x,32);
  ind = inx + __mul24(ind,lda);
  int ind2 = inx + __mul24(iny,32);

  A += ind;
  x += ind2;

  double res = 0.f;

  __shared__ double buff[dgemv_bs];
  __shared__ double la[32][33];

  for(int i=0; i<n1; i += dgemv_bs ){
      buff[ind2]  = x[i];
      #pragma unroll
      for(int j=0; j<16; j++)
         la[iny+__mul24(j,2)][inx] = A[j*__mul24(2,lda)];

      __syncthreads();
      #pragma unroll
      for(int j=0; j < 16; j++)
        res += la[inx][iny*16+j]*buff[j+iny*16];

      A += 32;

      //===============================================
      #pragma unroll
      for(int j=0; j<16; j++)
         la[iny+__mul24(j,2)][inx] = A[j*__mul24(2,lda)];

      __syncthreads();

      #pragma unroll
      for(int j=0; j < 16; j++)
        res += la[inx][iny*16+j]*buff[j+32+iny*16];
      A += 32;
    }

    if (n>n1){
      if (ind2>=(n-n1))
         buff[ind2]=0.;
      else
         buff[ind2]  = x[n1];

      #pragma unroll
      for(int j=0; j<16; j++)
         la[iny+__mul24(j,2)][inx] = A[j*__mul24(2,lda)];

     __syncthreads();

     if (n-n1>16){
	#pragma unroll
        for(int j=0; j < 16; j++)
           res += la[inx][iny*16+j]*buff[j+iny*16];

        A += 32;
        #pragma unroll
        for(int j=0; j<16; j++)
          la[iny+__mul24(j,2)][inx] = A[j*__mul24(2,lda)];

	__syncthreads();

        #pragma unroll
        for(int j=0; j < 16; j++)
           res += la[inx][iny*16+j]*buff[j+32+iny*16];
     }
     else {
        #pragma unroll
        for(int j=0; j < 16; j++)
          res += la[inx][iny*16+j]*buff[j+iny*16];
     }
  }
  ind = inx + __mul24(blockIdx.x,32);

  la[inx][iny]= res;
  if (ind<m){
     res = la[inx][0] + la[inx][1];
     y[ind] = alpha*res;
  }
}

__global__ void
dgemvt_kernel2(int n, int m, double alpha,
		int n1, double* A, int lda, double *x, double *y)
{
  const int inx = threadIdx.x;
  const int iny = threadIdx.y;

  int ind  = iny + __mul24(blockIdx.x,16);
  ind = inx + __mul24(ind,lda);
  int ind2 = inx + __mul24(iny,16);
  if (ind2>31)
     ind2-=32;

  A += ind;
  x += ind2;
  if (ind2>31)
     ind2-=32;

  double res = 0.f;

  __shared__ double buff[32];
  __shared__ double la[16][17];

  for(int i=0; i<n1; i += 32 ){
     buff[ind2]  = x[i];
     #pragma unroll
     for(int j=0; j<4; j++)
        la[iny+__mul24(j,4)][inx] = A[j*__mul24(4,lda)];

     __syncthreads();
     #pragma unroll
     for(int j=0; j < 4; j++)
       res += la[inx][iny*4+j]*buff[j+iny*4];

     A += 16;
    __syncthreads();
     //===========================================
     #pragma unroll
     for(int j=0; j<4; j++)
         la[iny+__mul24(j,4)][inx] = A[j*__mul24(4,lda)];

     __syncthreads();

     #pragma unroll
     for(int j=0; j < 4; j++)
        res += la[inx][iny*4+j]*buff[j+16+iny*4];
     A += 16;
  }

  if (n>n1){
     if (ind2>=(n-n1))
        buff[ind2]=0.;
     else
        buff[ind2]  = x[n1];

     __syncthreads();
     #pragma unroll
     for(int j=0; j<4; j++)
         la[iny+__mul24(j,4)][inx] = A[j*__mul24(4,lda)];

     __syncthreads();
     if (n-n1>4){
        #pragma unroll
	for(int j=0; j < 4; j++)
           res += la[inx][iny*4+j]*buff[j+iny*4];

        A += 16;
        __syncthreads();
        #pragma unroll
          for(int j=0; j<4; j++)
            la[iny+__mul24(j,4)][inx] = A[j*__mul24(4,lda)];

        __syncthreads();

        #pragma unroll
        for(int j=0; j < 4; j++)
           res += la[inx][iny*4+j]*buff[j+16+iny*4];
     }
     else {
        #pragma unroll
        for(int j=0; j < 4; j++)
          res += la[inx][iny*4+j]*buff[j+iny*4];
     }
  }

  __syncthreads();
  ind = inx + __mul24(blockIdx.x,16);
  la[inx][iny]= res;
  __syncthreads();
  if (ind<m){
     res = la[inx][0] + la[inx][1] + la[inx][2] + la[inx][3];
     y[ind] = alpha*res;
  }
}

extern "C" void
magmablas_dgemvt1(int m, int n, double alpha, double *A, int lda,
                  double *x, double *z)
{
/*  -- MAGMA (version 0.2) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       November 2009

    Purpose
    =======

    This routine computes z = alpha A^t x on the GPU.
    Recommended for large M and N.

    M      - (input) INTEGER.
             On entry, N specifies the number of rows of the matrix A.

    N      - (input) INTEGER.
             On entry, M specifies the number of columns of the matrix A

    A      - (input) DOUBLE PRECISION array of dimension ( LDA, n ) on the GPU.

    LDA    - (input) INTEGER.
             LDA specifies the leading dimension of A.

    X      - (input) DOUBLE PRECISION array of dimension m.

    Z      - (output) DOUBLE PRECISION array of dimension n.
             On exit Z = alpha A^t X.

    ===================================================================== */
    int blocks;

    if (n % 32==0)
        blocks = n/32;
    else
        blocks = n/32 + 1;

    dim3 grid(blocks, 1, 1);
    dim3 threads(32, 2, 1);
    dgemvt_kernel1<<<grid, threads>>>(m, n, alpha, (m / dgemv_bs)*dgemv_bs,
                                      A, lda, x, z);
}

extern "C" void
magmablas_dgemvt2(int m, int n, double alpha, double *A, int lda,
                  double *x, double *z)
{
/*  -- MAGMA (version 0.2) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       November 2009

    Purpose
    =======

    This routine computes z = alpha A^t x on the GPU. Used in least squares
    solver for N small (e.g. = BS, a block size of order 64, 128, etc).

    M      - (input) INTEGER.
             On entry, N specifies the number of rows of the matrix A.

    N      - (input) INTEGER.
             On entry, M specifies the number of columns of the matrix A

    A      - (input) DOUBLE PRECISION array of dimension ( LDA, n ) on the GPU.

    LDA    - (input) INTEGER.
             LDA specifies the leading dimension of A.

    X      - (input) DOUBLE PRECISION array of dimension m.

    Z      - (output) DOUBLE PRECISION array of dimension n.
             On exit Z = alpha A^t X.

    ===================================================================== */

    int blocks;

    if (n % 16==0)
        blocks = n/16;
    else
        blocks = n/16 + 1;

    dim3 grid(blocks, 1, 1);
    dim3 threads(16, 4, 1);

    dgemvt_kernel2<<<grid, threads>>>(m, n, alpha, (m / 32)*32,
                                      A, lda, x, z);
}

extern "C" void
magmablas_dgemvt(int m, int n, double alpha, double *A, int lda,
                 double *x, double *z)
{
/*  -- MAGMA (version 0.2) --
       Univ. of Tennessee, Knoxville
       Univ. of California, Berkeley
       Univ. of Colorado, Denver
       November 2009

    Purpose
    =======

    This routine computes z = alpha A^t x on the GPU.

    M      - (input) INTEGER.
             On entry, N specifies the number of rows of the matrix A.

    N      - (input) INTEGER.
             On entry, M specifies the number of columns of the matrix A

    A      - (input) SINGLE PRECISION array of dimension ( LDA, n ) on the GPU.

    LDA    - (input) INTEGER.
             LDA specifies the leading dimension of A.

    X      - (input) SINGLE PRECISION array of dimension m.

    Z      - (output) SINGLE PRECISION array of dimension n.
             On exit Z = alpha A^t X.

    ===================================================================== */

    if (n<=128)
      magmablas_dgemvt2(m, n, alpha, A, lda, x, z);
    else
      magmablas_dgemvt1(m, n, alpha, A, lda, x, z);
}

#undef num_threads
#undef dgemv_bs

MAGMA Forum

Problem with magmablas_dgemvt

Problem with magmablas_dgemvt

Re: Problem with magmablas_dgemvt