/*
 *  -- micMAGMA (version 0.2) --
 *     Univ. of Tennessee, Knoxville
 *     Univ. of California, Berkeley
 *     Univ. of Colorado, Denver
 *     April 2012
 *
 **/

Performance results of the main routines provided in micMAGMA 0.2

Hardware
========
MIC : KNC    --- 62 cores  @ 1 GHz
CPU : animal --- 4 x Six-Core Intel Xeon X5680          @ 3.33 GHz


Software
========
MIC : Alpha + SW Dev Tools 027 + MAGMA 0.2
CPU : MKL  11.1                + MAGMA 0.2

//////////////////////////////////////////////////////////////////

[tomov@rizzo magma_mic_sample]$ ./testing_spotrf 
scif_connect success

Usage: 
  testing_spotrf_gpu -N 1024

  N    GPU GFlop/s    ||R||_F / ||A||_F
==========================================
 1024	  34.22	        9.376754e-08
 2048	  69.39	        1.380465e-07
 3072	  102.90	8.476539e-08
 4032	  114.86	7.689565e-08
 5184	  140.48	1.322444e-07
 6048	  157.91	1.104898e-07
 7200	  171.01	1.001905e-07
 8064	  186.18	9.167686e-08
 8928	  187.65	1.637169e-07
10240	  211.25	1.530823e-07


[tomov@animal testing]$ ./testing_dgeqrf_mic 

Usage: 
  testing_dgeqrf_mic -M 1024 -N 1024

  M     N    CPU GFlop/s (sec)   GPU GFlop/s (sec)   ||R||_F / ||A||_F
======================================================================
 1024  1024    35.10 (  0.04)       1.39 (  1.03)       1.095769e-15
 2048  2048    76.81 (  0.15)       3.75 (  3.06)       1.106541e-15
 3072  3072   101.35 (  0.38)      27.04 (  1.43)       1.233043e-15
 4032  4032   107.91 (  0.81)      60.07 (  1.46)       1.246143e-15
 5184  5184   115.47 (  1.61)      81.82 (  2.27)       1.253971e-15
 6016  6016   118.55 (  2.45)      91.29 (  3.18)       1.251421e-15
 7040  7040   121.42 (  3.83)     110.66 (  4.21)       1.245907e-15
 8064  8064   122.89 (  5.69)     117.07 (  5.97)       1.305051e-15
 9088  9088   125.30 (  7.99)     131.58 (  7.61)       1.318396e-15
10176 10176   128.25 ( 10.96)     149.91 (  9.37)       1.333901e-15


[tomov@animal testing]$ ./testing_dpotrf_mic 

Usage: 
  testing_dpotrf_mic -N 1024

  N    CPU GFlop/s (sec)    GPU GFlop/s (sec)    ||R_magma-R_lapack||_F / ||R_lapack||_F
========================================================================================
 1024      79.64 (  0.00)       4.51 (  0.08)         1.704305e-16
 2048     110.43 (  0.03)      11.64 (  0.25)         1.652343e-16
 3072     124.19 (  0.08)      23.37 (  0.41)         1.383690e-16
 4032     128.74 (  0.17)      32.90 (  0.66)         1.139405e-16
 5184     132.58 (  0.35)      64.35 (  0.72)         1.891538e-16
 6048     133.43 (  0.55)      72.89 (  1.01)         1.753479e-16
 7200     134.38 (  0.93)      65.37 (  1.90)         1.580407e-16
 8064     134.96 (  1.30)      97.64 (  1.79)         1.464134e-16
 8928     135.07 (  1.76)      98.84 (  2.40)         2.546458e-16
10560     139.19 (  2.82)     117.40 (  3.34)         2.309352e-16

Same test, with syrk on CPU:
[tomov@animal testing]$ ./testing_dpotrf_mic

Usage: 
  testing_dpotrf_mic -N 1024

  N    CPU GFlop/s (sec)    GPU GFlop/s (sec)    ||R_magma-R_lapack||_F / ||R_lapack||_F
========================================================================================
 1024      62.61 (  0.01)       9.31 (  0.04)         1.741123e-16
 2048     109.51 (  0.03)       3.01 (  0.95)         1.681741e-16
 3072     122.70 (  0.08)      25.30 (  0.38)         1.424833e-16
 4032     127.90 (  0.17)      55.61 (  0.39)         1.198715e-16
 5184     131.90 (  0.35)     101.45 (  0.46)         2.194494e-16
 6048     132.31 (  0.56)     115.32 (  0.64)         1.985714e-16
 7200     134.58 (  0.92)     131.59 (  0.95)         1.645309e-16
 8064     134.91 (  1.30)     117.39 (  1.49)         1.529488e-16
 8928     135.67 (  1.75)     160.04 (  1.48)         2.699703e-16
10560     139.82 (  2.81)     180.60 (  2.17)         2.360442e-16


dgemm / dtrsm timings @ N = 10560
       M         N            K      T (sec)           GFlop    GFlop/s 
dgemm: 256	 10048	      256    0.510000	      1.317011	2.582375
dgemm: 256	 9792	      512    0.010000	      2.566914	256.691650
dgemm: 256	 9536	      768    0.020000	      3.749708	187.485568
dgemm: 256	 9280	      1024   0.020000	      4.865393	243.269864
dgemm: 256	 9024	      1280   0.030000	      5.913969	197.132476
dgemm: 256	 8768	      1536   0.020000	      6.895436	344.772118
dgemm: 256	 8512	      1792   0.030000	      7.809794	260.326717
dgemm: 256	 8256	      2048   0.030000	      8.657043	288.568390
dgemm: 256	 8000	      2304   0.040000	      9.437184	235.928419
dgemm: 256	 7744	      2560   0.030000	      10.150216	338.340845
dgemm: 256	 7488	      2816   0.040000	      10.796138	269.903720
dgemm: 256	 7232	      3072   0.040000	      11.374952	284.374082
dgemm: 256	 6976	      3328   0.040000	      11.886658	297.164951
dgemm: 256	 6720	      3584   0.040000	      12.331254	308.281638
dgemm: 256	 6464	      3840   0.050000	      12.708741	254.175065
dgemm: 256	 6208	      4096   0.050000	      13.019120	260.382641
dgemm: 256	 5952	      4352   0.050000	      13.262389	265.246773
dgemm: 256	 5696	      4608   0.050000	      13.438550	268.771257
dgemm: 256	 5440	      4864   0.060000	      13.547602	225.793581
dgemm: 256	 5184	      5120   0.050000	      13.589545	271.791158
dgemm: 256	 4928	      5376   0.050000	      13.564379	271.286548
dgemm: 256	 4672	      5632   0.060000	      13.472104	224.535288
dgemm: 256	 4416	      5888   0.050000	      13.312721	266.254672
dgemm: 256	 4160	      6144   0.050000	      13.086228	261.724819
dgemm: 256	 3904	      6400   0.040000	      12.792627	319.815985
dgemm: 256	 3648	      6656   0.050000	      12.431917	248.638578
dgemm: 256	 3392	      6912   0.040000	      12.004098	300.102737
dgemm: 256	 3136	      7168   0.050000	      11.509170	230.183623
dgemm: 256	 2880	      7424   0.040000	      10.947133	273.676966
dgemm: 256	 2624	      7680   0.040000	      10.317988	257.949942
dgemm: 256	 2368	      7936   0.040000	      9.621733	240.543564
dgemm: 256	 2112	      8192   0.050000	      8.858370	177.167570
dgemm: 256	 1856	      8448   0.040000	      8.027898	200.696442
dgemm: 256	 1600	      8704   0.030000	      7.130317	237.677453
dgemm: 256	 1344	      8960   0.030000	      6.165627	205.521092
dgemm: 256	 1088	      9216   0.030000	      5.133828	171.127766
dgemm: 256	 832	      9472   0.030000	      4.034920	134.497477
dgemm: 256	 576	      9728   0.030000	      2.868904	95.630222
dgemm: 256	 320	      9984   0.030000	      1.635779	54.526004
dgemm: 256	 64	      10240  0.020000	      0.335544	16.777232

dtrsm: ?	 256	      10304  0.020000	      0.675283	33.764179
dtrsm: ?	 256	      10048  0.020000	      0.658506	32.925318
dtrsm: ?	 256	      9792   0.020000	      0.641729	32.086456
dtrsm: ?	 256	      9536   0.020000	      0.624951	31.247595
dtrsm: ?	 256	      9280   0.020000	      0.608174	30.408371
dtrsm: ?	 256	      9024   0.020000	      0.591397	29.569871
dtrsm: ?	 256	      8768   0.020000	      0.574620	28.731010
dtrsm: ?	 256	      8512   0.020000	      0.557842	27.892148
dtrsm: ?	 256	      8256   0.020000	      0.541065	27.053287
dtrsm: ?	 256	      8000   0.020000	      0.524288	26.214113
dtrsm: ?	 256	      7744   0.020000	      0.507511	25.375563
dtrsm: ?	 256	      7488   0.020000	      0.490734	24.536702
dtrsm: ?	 256	      7232   0.020000	      0.473956	23.697840
dtrsm: ?	 256	      6976   0.020000	      0.457179	22.858706
dtrsm: ?	 256	      6720   0.010000	      0.440402	44.040234
dtrsm: ?	 256	      6464   0.010000	      0.423625	42.362511
dtrsm: ?	 256	      6208   0.020000	      0.406847	20.342394
dtrsm: ?	 256	      5952   0.010000	      0.390070	39.006134
dtrsm: ?	 256	      5696   0.020000	      0.373293	18.664671
dtrsm: ?	 256	      5440   0.020000	      0.356516	17.825809
dtrsm: ?	 256	      5184   0.010000	      0.339739	33.973895
dtrsm: ?	 256	      4928   0.010000	      0.322961	32.296172
dtrsm: ?	 256	      4672   0.020000	      0.306184	15.309224
dtrsm: ?	 256	      4416   0.010000	      0.289407	28.940725
dtrsm: ?	 256	      4160   0.010000	      0.272630	27.263002
dtrsm: ?	 256	      3904   0.010000	      0.255853	25.585279
dtrsm: ?	 256	      3648   0.010000	      0.239075	23.907556
dtrsm: ?	 256	      3392   0.010000	      0.222298	22.229832
dtrsm: ?	 256	      3136   0.010000	      0.205521	20.552109
dtrsm: ?	 256	      2880   0.010000	      0.188744	18.874386
dtrsm: ?	 256	      2624   0.010000	      0.171966	17.196663
dtrsm: ?	 256	      2368   0.010000	      0.155189	15.5189
dtrsm: ?	 256	      2112   0.010000	      0.138412	13.841216
dtrsm: ?	 256	      1856   0.010000	      0.121635	12.163493
dtrsm: ?	 256	      1600   0.010000	      0.104858	10.485770
dtrsm: ?	 256	      1344   0.010000	      0.088080	8.808
dtrsm: ?	 256	      1088   0.010000	      0.071303	7.130324
dtrsm: ?	 256	      832    0.010000	      0.054526	5.452600
dtrsm: ?	 256	      576    0.010000	      0.037749	3.7749
dtrsm: ?	 256	      320    0.010000	      0.020972	2.097154
dtrsm: ?	 256	      64     0.010000	      0.004194	0.419431
