Hardware
========
GPU : C2050 --- 14 Multiprocessors ( x 32 CUDA cores)        @ 1.15 GHz    
CPU : yona  --- 2 x Six-Core  AMD Opteron(tm) Processor 2435 @ 2.6  GHz
      disco --- 1 x Quad-Core Intel Core2 Processor    Q9300 @ 2.50 GHz

Software
========
GPU : CUDA  3.2 + MAGMA 1.0 
CPU : MKL  11.1 + MAGMA 1.0

//////////////////////////////////////////////////////////////////

spotrf_gpu (C2050, disco@utk)
  N    CPU GFlop/s    GPU GFlop/s    ||R||_F / ||A||_F
========================================================
 1024     40.65          72.47        4.598919e-08
 2048     52.90         199.98        4.912413e-08
 3072     59.06         281.52        5.113575e-08
 4032     62.48         329.67        5.394260e-08
 5184     63.79         375.49        5.537155e-08
 6048     65.04         399.62        5.792269e-08
 7200     65.34         437.71        5.965600e-08
 8064     67.60         443.53        6.124631e-08
 8928     68.03         461.27        6.407727e-08
10240     65.49         480.40        6.795680e-08

sgetrf_gpu (C2050, disco@utk)
  M     N   CPU GFlop/s    GPU GFlop/s   ||PA-LU||/(||A||*N)
============================================================
  960   960   43.14          42.19         2.242990e-09
 1920  1920   50.30         120.58         1.944728e-09
 3072  3072   46.15         198.43         1.814167e-09
 4032  4032   44.80         275.54         2.051052e-09
 4992  4992   46.31         330.48         1.966408e-09
 5952  5952   47.12         373.41         1.885595e-09
 7104  7104   50.20         407.96         1.841615e-09
 8064  8064   50.03         424.27         1.944891e-09
 9024  9024   50.82         448.38         2.127017e-09
 9984  9984   51.33         463.11         2.287123e-09

sgeqrf_gpu (C2050, disco@utk)
  M     N   CPU GFlop/s   GPU GFlop/s    ||R||_F / ||A||_F
==========================================================
 1024  1024   21.68          85.40        1.691922e-07
 2048  2048   29.44         185.76        4.265398e-08
 3072  3072   33.42         288.98        1.836671e-08
 4032  4032   39.62         351.60        1.522754e-08
 5184  5184   41.31         404.80        1.269924e-08
 6016  6016   41.80         428.09        1.309676e-08
 7040  7040   42.21         450.42        7.601059e-08
 8064  8064   42.26         473.26        1.018627e-08
 9088  9088   42.78         487.70        9.882160e-09
 9984  9984   42.73         500.60        1.039224e-08

sorgqr (C2050, disco@utk)
  M     N     MAGMA CPU    MAGMA GPU     ||R|| / ||A||
=======================================================
 1024  1024     89.1        130.8         7.22e-09 
 2048  2048    143.9        253.0         5.44e-09 
 3072  3072    214.4        375.8         2.38e-09 
 4032  4032    292.9        437.5         1.77e-09 
 5184  5184    365.3        477.3         1.37e-09 
 6016  6016    401.9        489.9         1.22e-09 
 7040  7040    435.0        502.7         1.03e-09 
 8064  8064    446.6        512.4         9.65e-10 
 9088  9088    442.0        520.9         9.78e-10 
 9984  9984    443.4        531.7         9.85e-10 


  M     N    CPU GFlop/s   GPU GFlop/s   ||R|| / ||A||
=======================================================
 1024  1024     20.3         99.7         7.36e-09 
 2048  2048     17.6        207.5         5.68e-09 
 3072  3072     19.8        305.2         2.63e-09 
 4032  4032     22.8        383.1         1.95e-09 
 5184  5184     23.3        433.3         1.52e-09 
 6016  6016     23.3        452.3         1.35e-09 
 7040  7040     23.4        471.8         1.15e-09 
 8064  8064     24.2        489.0         9.66e-10 
 9088  9088     25.5        499.9         9.78e-10 
 9984  9984     26.0        512.4         9.86e-10 

//////////////////////////////////////////////////////////////////

cpotrf_gpu (C2050, yona@ornl)
  N    CPU GFlop/s    GPU GFlop/s    ||R||_F / ||A||_F
========================================================
 1024     39.11          98.88        4.089335e-08
 2048    106.62         265.85        5.565143e-08
 3072    151.73         351.27        4.023858e-08
 4032    155.97         398.04        3.366742e-08
 5184    164.01         446.36        5.200750e-08
 6048    167.82         470.39        4.751503e-08
 7200    167.79         501.65        4.519429e-08
 8064    171.12         537.54        4.269581e-08
 8928    169.67         536.81        6.565388e-08
10240    147.12         577.43        5.203859e-08

cgetrf_gpu (C2050, yona@ornl)
  M     N   CPU GFlop/s    GPU GFlop/s   ||PA-LU||/(||A||*N)
============================================================
  960   960    6.55         106.02         3.071780e-09
 1920  1920   97.01         276.65         2.950013e-09
 3072  3072  115.96         399.54         2.903989e-09
 4032  4032  148.26         474.44         2.843912e-09
 4992  4992  157.70         516.64         2.809787e-09
 5952  5952  143.87         544.70         3.139649e-09
 7104  7104  166.62         566.15         3.650138e-09
 8064  8064  169.98         576.61         3.928518e-09
 9024  9024  151.16         588.98         4.347956e-09
 9984  9984  152.92         598.74         4.736127e-09

cgeqrf_gpu (C2050, yona@ornl)
  M     N   CPU GFlop/s   GPU GFlop/s    ||R||_F / ||A||_F
==========================================================
 1024  1024   53.18         127.88        1.363184e-06
 2048  2048  113.28         233.46        1.826993e-06
 3072  3072  149.63         398.12        2.241375e-06
 4032  4032  146.89         487.54        2.570974e-06
 5184  5184  158.60         580.78        2.980236e-06
 6016  6016  164.78         616.62        3.180960e-06
 7040  7040  158.09         643.77        3.285831e-06
 8064  8064  168.39         662.02        3.416155e-06
 9088  9088  171.96         676.46        3.489114e-06
 9984  9984  172.93         683.90        3.469477e-06

cungqr (C2050, yona@ornl)
  M     N    MAGMA CPU    MAGMA GPU    ||R|| / ||A||
=======================================================
 1024  1024    154.4        158.4         2.73e-07 
 2048  2048    148.6        288.1         2.86e-07 
 3072  3072    339.9        364.4         2.91e-07 
 4032  4032    464.2        474.8         2.89e-07 
 5184  5184    535.4        550.4         3.25e-07 
 6016  6016    598.3        537.8         3.54e-07 
 7040  7040    625.8        571.3         3.97e-07 
 8064  8064    636.9        615.8         4.73e-07 
 9088  9088    664.1        608.4         5.27e-07 
 9984  9984    668.7        636.6         6.09e-07 

//////////////////////////////////////////////////////////////////

dpotrf_gpu (C2050, disco@utk)
  N    CPU GFlop/s    GPU GFlop/s    ||R||_F / ||A||_F
========================================================
 1024     20.15          49.77        7.367900e-17
 2048     28.56         111.06        9.213125e-17
 3072     31.30         154.37        9.958574e-17
 4032     30.25         177.38        1.017019e-16
 5184     33.39         195.69        1.042985e-16
 6048     33.86         210.62        1.073428e-16
 7200     32.48         220.59        1.060223e-16
 8064     33.48         231.46        1.072428e-16
 8928     33.51         236.39        1.104858e-16
10240     33.41         246.52        1.142382e-16

dgetrf_gpu (C2050, disco@utk)
  M     N   CPU GFlop/s    GPU GFlop/s   ||PA-LU||/(||A||*N)
============================================================
  960   960   12.20          30.59         4.186512e-18
 1920  1920   11.29          71.52         3.631333e-18
 3072  3072   11.17         120.90         4.081900e-18
 4032  4032   12.24         157.15         3.816437e-18
 4992  4992   11.98         181.97         3.648711e-18
 5952  5952   12.42         200.59         3.485279e-18
 7104  7104   12.23         215.87         3.407906e-18
 8064  8064   12.46         225.64         2.725883e-18
 9024  9024   12.02         233.94         2.636555e-18
 9984  9984   33.89         240.93         2.554841e-18

dgeqrf_gpu (C2050, disco@utk)
  M     N   CPU GFlop/s   GPU GFlop/s    ||R||_F / ||A||_F
==========================================================
 1024  1024   19.54          49.25        2.040039e-15
 2048  2048   13.79         110.97        2.662709e-15
 3072  3072   13.86         154.91        3.256163e-15
 4032  4032   14.15         185.46        3.546103e-15
 5184  5184   14.40         209.52        3.835442e-15
 6016  6016   14.41         220.62        4.208191e-15
 7040  7040   14.35         230.69        4.564374e-15
 8064  8064   14.19         238.10        4.898536e-15
 9088  9088   14.50         243.72        5.240898e-15
 9984  9984   14.37         247.45        5.322578e-15

dorgqr (C2050, disco@utk)
  M     N     MAGMA CPU    MAGMA GPU    ||R|| / ||A||
=======================================================
 1024  1024     67.4         97.8         1.06e-17 
 2048  2048    133.9        164.2         4.23e-18 
 3072  3072    141.8        204.3         4.56e-18 
 4032  4032    175.2        223.5         3.59e-18 
 5184  5184    202.4        238.1         2.93e-18 
 6016  6016    216.1        246.8         2.71e-18 
 7040  7040    228.0        253.3         2.50e-18 
 8064  8064    236.8        258.6         2.13e-18 
 9088  9088    243.6        262.5         1.93e-18 
 9984  9984    248.4        265.5         1.84e-18 

dgeqp3 (QR with pivoting, M2090, Keeneland)
  M     N   CPU GFlop/s (sec)   GPU GFlop/s (sec)  ||A*P - Q*R||_F  
==================================================================
 1024  1024   37.23 (  0.15)     19.97 (  0.29)     4.274258e-18
 2048  2048   54.87 (  0.84)     50.57 (  0.91)     3.985089e-18
 3072  3072   37.70 (  4.10)     75.44 (  2.05)     1.660469e-18
 4032  4032   34.95 ( 10.01)     95.23 (  3.67)     1.418927e-18
 5184  5184   34.36 ( 21.63)    112.58 (  6.60)     1.177798e-18
 6016  6016   33.54 ( 34.63)    122.05 (  9.52)     1.637618e-18
 7040  7040   34.61 ( 53.78)    133.30 ( 13.96)     1.310186e-18
 8064  8064   34.70 ( 80.61)    138.26 ( 20.23)     1.514179e-18
 9088  9088   33.07 (121.09)    146.74 ( 27.29)     7.416005e-19
 9984  9984   35.05 (151.47)    149.35 ( 35.54)     7.378708e-19

//////////////////////////////////////////////////////////////////

zpotrf_gpu (C2050, yona@ornl)
  N    CPU GFlop/s    GPU GFlop/s    ||R||_F / ||A||_F
========================================================
 1024     23.62          73.13        1.239284e-16
 2048     36.60         129.32        1.571613e-16
 3072     65.06         164.62        1.401342e-16
 4032     80.81         185.02        1.176092e-16
 5184     82.40         203.40        2.103705e-16
 6048     83.88         212.49        1.958623e-16
 7200     84.01         223.52        1.786711e-16
 8064     90.06         230.65        1.683656e-16
 8928     88.49         235.11        3.098039e-16
10240     47.51         242.72        2.866066e-16

zgetrf_gpu (C2050, yona@ornl)
  M     N   CPU GFlop/s    GPU GFlop/s   ||PA-LU||/(||A||*N)
============================================================
  960   960   12.57          69.77         5.809538e-18
 1920  1920   48.86         147.99         5.614021e-18
 3072  3072   48.03         180.04         5.382074e-18
 4032  4032   74.76         217.02         5.247905e-18
 4992  4992   80.06         224.91         5.152215e-18
 5952  5952   73.43         233.28         5.126963e-18
 7104  7104   84.26         241.50         5.069285e-18
 8064  8064   85.29         244.57         4.974459e-18
 9024  9024   77.30         248.35         4.960506e-18
 9984  9984   78.13         249.74         4.920588e-18

zgeqrf_gpu (C2050, yona@ornl)
  M     N   CPU GFlop/s   GPU GFlop/s    ||R||_F / ||A||_F
==========================================================
 1024  1024   38.96          60.66        2.727059e-15
 2112  2112   65.87         138.66        3.622483e-15
 3072  3072   66.98         177.39        4.248629e-15
 4032  4032   75.01         220.05        4.908036e-15
 5184  5184   80.78         238.15        5.533331e-15
 6016  6016   83.50         245.78        5.899060e-15
 7040  7040   81.11         252.01        6.200250e-15
 8064  8064   84.83         256.30        6.551392e-15
 9088  9088   86.57         259.38        6.831344e-15
 9984  9984   85.56         261.45        6.989375e-15

zungqr (C2050, yona@ornl)
  M     N    MAGMA CPU    MAGMA GPU    ||R|| / ||A||
=======================================================
 1024  1024     38.2         70.5         5.13e-16 
 2048  2048     30.1         96.4         5.15e-16 
 3072  3072    110.6        163.2         5.63e-16 
 4032  4032    223.4        232.0         5.96e-16 
 5184  5184    244.6        249.3         5.99e-16 
 6016  6016    251.0        233.0         5.81e-16 
 7040  7040    250.4        236.7         5.43e-16 
 8064  8064    254.4        246.9         5.56e-16 
 9088  9088    257.6        250.2         5.70e-16 
 9984  9984    259.8        255.2         5.85e-16 

////////////////////////////////////////////////////////////////// 
Solvers
//////////////////////////////////////////////////////////////////

zcgeqrsv_gpu -nrhs 1 (C2050, yona@ornl)

Epsilon(double): 1.110223e-16
Epsilon(single): 5.960464e-08

           CPU GFlop/s                 GPU GFlop/s   
  N        Double          Double    Single        Mixed    ||b-Ax||/||A||   iter.
=====================================================================================
 1024      24.92            38.70    111.85        38.89     2.072269e-15      2
 2048      27.16            53.96    214.27       180.78     4.995612e-14      2
 3072      47.10           170.25    371.18       319.72     2.475057e-15      3
 4032      55.32           215.71    456.34       397.21     4.254556e-15      3
 5184      62.60           236.98    554.02       505.07     1.704509e-15      3
 6016      66.09           246.92    590.75       544.80     3.229644e-15      3
 7040      65.90           254.69    621.88       578.67     2.742809e-15      3

dsgesv_gpu -nrhs 1 (C2050, disco@utk)
  N   DP-Factor  DP-Solve  SP-Factor  SP-Solve  MP-Solve  ||b-Ax||/||A||  NumIter
==================================================================================
 1024   29.49     26.49     39.12      37.16      22.67     1.891330e-16      3
 2048   86.06     78.47    115.32     110.45      92.72     2.393012e-15      3
 3072  119.41    112.05    190.36     184.38     160.43     1.902408e-16      3
 4032  154.71    143.03    253.87     250.08     206.52     6.866680e-14      6
 5184  185.52    177.07    331.61     327.21     293.24     2.272523e-15      3
 6016  200.26    193.24    363.83     359.94     330.96     2.008029e-14      3
 7040  209.54    207.09    400.50     395.41     365.60     2.323560e-15      3
 8064  223.97    217.91    422.42     418.30     382.66     2.288920e-15      4

zcgesv_gpu -nrhs 1 (C2050, yona@ornl)
  N   DP-Factor  DP-Solve  SP-Factor  SP-Solve  MP-Solve  ||b-Ax||/||A||  NumIter
==================================================================================
 1024   50.24     37.64    109.41      73.37      33.87     4.098540e-15      2
 2048   61.12     51.26    258.99     171.75      82.08     8.620067e-15      3
 3072  174.80    128.75    404.98     268.91     129.83     6.486969e-16      3
 4032  207.72    160.45    475.29     323.24     138.31     3.068121e-16      4
 5184  227.69    178.97    523.97     380.93     205.23     2.986088e-15      3
 6016  239.47    192.24    560.27     416.18     228.43     7.679333e-15      3
 7040  244.82    201.64    578.22     443.28     224.78     1.509889e-16      4
 7520  246.27    204.81    576.43     448.08     231.34     9.861416e-16      4
 8064  247.67    208.12    587.78     461.54     242.98     2.889014e-16      4
 8192  215.34    185.20    491.08     400.60     225.92     2.884835e-16      4



dsposv_gpu -nrhs 1 (C2050, disco@utk)
  N   DP-Factor  DP-Solve  SP-Factor  SP-Solve  MP-Solve  ||b-Ax||/||A||  NumIter
==================================================================================
 1024   41.70     31.15     57.62      51.16      20.13     1.495555e-18      2
 2048  103.42     85.70    187.92     165.71     121.77     1.406742e-18      2
 3072  147.18    128.91    285.15     261.54     203.88     9.083634e-19      2
 4032  174.95    153.06    344.39     321.91     262.91     8.678348e-19      2
 5184  206.48    185.50    392.52     372.49     302.27     6.734822e-19      2
 6016  220.15    201.90    417.88     400.77     336.02     7.558627e-19      2
 7040  232.67    213.68    449.79     434.32     374.45     6.727643e-19      2
 8064  239.09    224.65    467.21     452.77     399.80     8.781449e-19      2

zcposv_gpu -nrhs 1 (C2050, yona@ornl)
  N   DP-Factor  DP-Solve  SP-Factor  SP-Solve  MP-Solve  ||b-Ax||/||A||  NumIter
==================================================================================
 1024   32.77     34.88    101.28      52.60      24.78     2.497296e-18      2
 2048  131.09     71.26    270.53     129.67      60.58     2.784614e-18      2
 3072  165.47     98.85    354.07     187.32      92.39     2.341482e-18      2
 4032  188.61    118.35    392.16     220.93     113.00     1.639098e-18      2
 5184  206.66    137.68    443.04     270.95     146.26     1.673193e-18      2
 6016  215.22    149.23    479.81     302.33     166.45     1.519763e-18      2
 7040  225.69    161.36    510.44     332.20     188.40     1.734973e-18      2
 8064  234.33    172.40    535.81     358.41     208.03     1.560348e-18      2

//////////////////////////////////////////////////////////////////
Two-sided factorizations
//////////////////////////////////////////////////////////////////

xgehrd (C2050, sd on disco@utk, cz on yona@ornl)
  N       CP      SP      ZP      DP
========================================
 1024    30.06   23.02   16.97   16.12
 2048    83.53   56.45   38.02   33.21
 3072   144.25   87.17   55.05   50.26
 4032   174.95  105.91   83.2    62.75
 5184   200.35  134.05   93.93   73.39
 6016   213.81  147.71   99.77   78.83
 7040   227.53  158.81  105.13   84.23
 8064   234.14  163.67  104.04   86.5
 9088   235.88  173.48  105.11   84.79
10112   238.47  176.21  107.19   83.23

xsytrd (C2050, sd on disco@utk, cz on yona@ornl)
/* BLAS used can be further optimized */
  N       CP       SP      ZP      DP
=========================================
 1024     9.55    18.93   10.36   11.03
 2048    27.51    19.72   13.42   13.13
 3072    34.54    31.28   20.31   18.69
 4032    32.72    40.27   21      22.53
 5184    38.44    52.51   23.65   26.2
 6016    39.22    58.23   24.18   28.1
 7040    39.92    64.18   24.44   29.75
 8064    38       64.64   24.23   30.74
 9088    40.91    72.92   24.87   31.92
10112    41.38    76.53   25.26   32.68

xgebrd
  N       CP      SP      ZP      DP
==================================================
 1024    12.45    8.36    7.58    6.66
 2048    23.01   14.2     9.94   12.32
 3072    36.95   21.96   18.89   17.7
 4032    53.51   39.24   27.7    25.37
 5184    61.07   46.75   31.27   28.04
 6016    64.88   50.48   31      27.97
 7040    67.1    54.42   32.38   28.8
 8064    67.14   55.19   32.25   29.01
 9088    69.95   57.71   33.56   28.93
10112    71.61   58.92   34.31   29
