9 __asm__ __volatile__(
"movaps (%0), %%xmm0;"
10 "movaps (%1), %%xmm1;"
11 "addps %%xmm0, %%xmm1;"
12 "movaps %%xmm1, (%2);"::
"r"(
aa ),
19 __asm__ __volatile__(
"movaps (%0), %%xmm0;"
20 "movaps (%1), %%xmm1;"
21 "mulps %%xmm0, %%xmm1;"
22 "movaps %%xmm1, (%2);"::
"r"(
aa ),
29 __asm__ __volatile__(
"movapd (%0), %%xmm0;"
30 "movapd (%1), %%xmm1;"
31 "addpd %%xmm0, %%xmm1;"
32 "movapd %%xmm1, (%2);"::
"r"(
aa ),
39 __asm__ __volatile__(
"movapd (%0), %%xmm0;"
40 "movapd (%1), %%xmm1;"
41 "mulpd %%xmm0, %%xmm1;"
42 "movapd %%xmm1, (%2);"::
"r"(
aa ),
49 __asm__ __volatile__(
"movss (%0), %%xmm0;"
51 "addss %%xmm0, %%xmm1;"
52 "movss %%xmm1, (%2);"::
"r"(
aa ),
"r"(
bb ),
"r"( cc )
58 __asm__ __volatile__(
"movss (%0), %%xmm0;"
60 "mulss %%xmm0, %%xmm1;"
61 "movss %%xmm1, (%2);"::
"r"(
aa ),
"r"(
bb ),
"r"( cc )
67 __asm__ __volatile__(
"movsd (%0), %%xmm0;"
69 "addsd %%xmm0, %%xmm1;"
70 "movsd %%xmm1, (%2);"::
"r"(
aa ),
"r"(
bb ),
"r"( cc )
76 __asm__ __volatile__(
"movsd (%0), %%xmm0;"
78 "mulsd %%xmm0, %%xmm1;"
79 "movsd %%xmm1, (%2);"::
"r"(
aa ),
"r"(
bb ),
"r"( cc )
84main(
int argc,
char **argv )
86 int i, packed = 0, sse = 0;
87 float a[4] = { 1.0, 2.0, 3.0, 4.0 };
88 float b[4] = { 2.0, 3.0, 4.0, 5.0 };
89 float c[4] = { 0.0, 0.0, 0.0, 0.0 };
90 double d[4] = { 1.0, 2.0, 3.0, 4.0 };
91 double e[4] = { 2.0, 3.0, 4.0, 5.0 };
92 double f[4] = { 0.0, 0.0, 0.0, 0.0 };
96 printf(
"Usage %s: <packed|unpacked> <sse|sse2>\n", argv[0] );
99 if ( strcasecmp( argv[1],
"packed" ) == 0 )
101 else if ( strcasecmp( argv[1],
"unpacked" ) == 0 )
105 if ( strcasecmp( argv[2],
"sse" ) == 0 )
107 else if ( strcasecmp( argv[2],
"sse2" ) == 0 )
114 ( system(
"cat /proc/cpuinfo | grep sse > /dev/null" ) != 0 ) ) {
115 printf(
"This processor does not have SSE.\n" );
119 ( system(
"cat /proc/cpuinfo | grep sse2 > /dev/null" ) != 0 ) ) {
120 printf(
"This processor does not have SSE2.\n" );
125 printf(
"Vector 1: %f %f %f %f\n",
a[0],
a[1],
a[2],
a[3] );
126 printf(
"Vector 2: %f %f %f %f\n\n",
b[0],
b[1],
b[2],
b[3] );
128 if ( ( packed == 0 ) && ( sse == 1 ) ) {
132 printf(
"%d SSE Unpacked Adds: Result %f\n",
NUMBER,
c[0] );
137 printf(
"%d SSE Unpacked Muls: Result %f\n",
NUMBER,
c[0] );
139 if ( ( packed == 1 ) && ( sse == 1 ) ) {
143 printf(
"%d SSE Packed Adds: Result %f %f %f %f\n",
NUMBER,
c[0],
c[1],
148 printf(
"%d SSE Packed Muls: Result %f %f %f %f\n",
NUMBER,
c[0],
c[1],
152 if ( ( packed == 0 ) && ( sse == 0 ) ) {
156 printf(
"%d SSE2 Unpacked Adds: Result %f\n",
NUMBER,
c[0] );
161 printf(
"%d SSE2 Unpacked Muls: Result %f\n",
NUMBER,
c[0] );
163 if ( ( packed == 1 ) && ( sse == 0 ) ) {
167 printf(
"%d SSE2 Packed Adds: Result %f\n",
NUMBER,
c[0] );
172 printf(
"%d SSE2 Packed Muls: Result %f\n",
NUMBER,
c[0] );
static double a[MATRIX_SIZE][MATRIX_SIZE]
static double b[MATRIX_SIZE][MATRIX_SIZE]
static double c[MATRIX_SIZE][MATRIX_SIZE]
void inline_packed_sse_add(float *aa, float *bb, float *cc)
void inline_unpacked_sse2_mul(double *aa, double *bb, double *cc)
void inline_packed_sse2_add(double *aa, double *bb, double *cc)
void inline_unpacked_sse2_add(double *aa, double *bb, double *cc)
void inline_unpacked_sse_mul(float *aa, float *bb, float *cc)
void inline_packed_sse_mul(float *aa, float *bb, float *cc)
void inline_packed_sse2_mul(double *aa, double *bb, double *cc)
void inline_unpacked_sse_add(float *aa, float *bb, float *cc)