00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015 #ifndef MTL_OPTERON_MATRIX_MULT_INCLUDE
00016 #define MTL_OPTERON_MATRIX_MULT_INCLUDE
00017
00018 #if defined MTL_USE_OPTERON_OPTIMIZATION && defined __GNUC__ && !defined __INTEL_COMPILER
00019
00020 #include <boost/numeric/mtl/operation/assign_mode.hpp>
00021 #include <boost/numeric/mtl/operation/set_to_zero.hpp>
00022 #include <boost/numeric/mtl/recursion/bit_masking.hpp>
00023
00024
00025 namespace mtl {
00026
00027 namespace detail {
// Compile-time predicate: true iff all three Morton masks describe the
// "shark-teeth" hybrid layout this kernel is tuned for -- 32x32 base
// blocks (2^base_case_bits) with tooth length 1, where A and C use
// row-major teeth and B uses column-major teeth.  Only when this holds
// may the hand-optimized Opteron kernels below be dispatched to.
template <unsigned long MaskA, unsigned long MaskB, unsigned long MaskC>
struct opteron_shark_teeth
{
    // 2^5 = 32 rows/cols per base block; each tooth is 1 element wide.
    static const unsigned long base_case_bits= 5, tooth_length = 1;
    static const bool value= is_k_power_base_case_row_major_t_shark<base_case_bits, tooth_length, MaskA>::value
                             && is_k_power_base_case_col_major_t_shark<base_case_bits, tooth_length, MaskB>::value
                             && is_k_power_base_case_row_major_t_shark<base_case_bits, tooth_length, MaskC>::value;
};
00036
00037 template <typename Assign, unsigned long MaskA, typename PA,
00038 unsigned long MaskB, typename PB,
00039 unsigned long MaskC, typename PC>
00040 inline void
00041 opteron_shark_teeth_mult(const morton_dense<double, MaskA, PA>& a, const morton_dense<double, MaskB, PB>& b,
00042 morton_dense<double, MaskC, PC>& c)
00043 {
00044
00045
00046 typedef typename morton_dense<double, MaskA, PA>::size_type size_type;
00047 size_type i_max= c.num_rows(), i_block= 2 * (i_max / 2),
00048 j_max= c.num_cols(), j_block= 2 * (j_max / 2),
00049 k_max= a.num_cols();
00050 const int stride= 32;
00051
00052 double *ap= &const_cast<morton_dense<double, MaskA, PA>&>(a)[0][0],
00053 *bp= &const_cast<morton_dense<double, MaskB, PB>&>(b)[0][0], *cp= &c[0][0];
00054
00055
00056 for (size_type i= 0; i < i_block; i+=2)
00057 for (int j = 0; j < j_block; j+=2) {
00058 double tmp00= 0.0, tmp01= 0.0, tmp10= 0.0, tmp11= 0.0;
00059 for (int k = 0; k < k_max; k++) {
00060 tmp00 += ap[0+(i)*stride+2*k] * bp[0+(j)*stride+2*k];
00061 tmp01 += ap[0+(i)*stride+2*k] * bp[1+(j)*stride+2*k];
00062 tmp10 += ap[1+(i)*stride+2*k] * bp[0+(j)*stride+2*k];
00063 tmp11 += ap[1+(i)*stride+2*k] * bp[1+(j)*stride+2*k];
00064 }
00065 Assign::update(cp[0+(i)*stride+2*(j+0)], tmp00);
00066 Assign::update(cp[0+(i)*stride+2*(j+1)], tmp01);
00067 Assign::update(cp[1+(i)*stride+2*(j+0)], tmp10);
00068 Assign::update(cp[1+(i)*stride+2*(j+1)], tmp11);
00069 }
00070
00071
00072 for (size_type i= 0; i < i_block; i+=2)
00073 for (int j = j_block; j < j_max; j++) {
00074 double tmp00= 0.0, tmp10= 0.0;
00075 for (int k = 0; k < k_max; k++) {
00076 tmp00 += ap[0+(i)*stride+2*k] * bp[0+(j)*stride+2*k];
00077 tmp10 += ap[1+(i)*stride+2*k] * bp[0+(j)*stride+2*k];
00078 }
00079 Assign::update(cp[0+(i)*stride+2*(j+0)], tmp00);
00080 Assign::update(cp[1+(i)*stride+2*(j+0)], tmp10);
00081 }
00082
00083
00084 for (size_type i= i_block; i < i_max; i++)
00085 for (int j = 0; j < j_max; j++) {
00086 double tmp00= 0.0;
00087 for (int k = 0; k < k_max; k++)
00088 tmp00 += ap[0+(i)*stride+2*k] * bp[0+(j)*stride+2*k];
00089 Assign::update(cp[0+(i)*stride+2*(j+0)], tmp00);
00090 }
00091 }
00092
00093 }
00094
00095
00096
00097 template <unsigned long MaskA, typename PA,
00098 unsigned long MaskB, typename PB,
00099 unsigned long MaskC, typename PC,
00100 typename Assign, typename Backup>
00101 struct gen_platform_dmat_dmat_mult_ft<morton_dense<double, MaskA, PA>, morton_dense<double, MaskB, PB>,
00102 morton_dense<double, MaskC, PC>, Assign, Backup>
00103 {
00104 void mult_ass(double * D, double * C, double * BT) const;
00105
00106 void operator()(const morton_dense<double, MaskA, PA>& a, const morton_dense<double, MaskB, PB>& b,
00107 morton_dense<double, MaskC, PC>& c) const
00108 {
00109
00110
00111 if (detail::opteron_shark_teeth<MaskA, MaskB, MaskC>::value) {
00112 if (Assign::init_to_zero)
00113 set_to_zero(c);
00114 if (a.num_rows() == 32 && a.num_cols() == 32 && b.num_cols() == 32) {
00115 double *ap= const_cast<morton_dense<double, MaskA, PA>&>(a).elements(),
00116 *bp= const_cast<morton_dense<double, MaskB, PB>&>(b).elements(), cp= &c.elements();
00117 mult_ass(cp, ap, bp);
00118 } else
00119 detail::opteron_shark_teeth_mult<Assign>(a, b, c);
00120 return;
00121 }
00122 Backup()(a, b, c);
00123 }
00124 };
00125
00126
00127
00128 template <unsigned long MaskA, typename PA,
00129 unsigned long MaskB, typename PB,
00130 unsigned long MaskC, typename PC,
00131 typename Backup>
00132 struct gen_platform_dmat_dmat_mult_ft<morton_dense<double, MaskA, PA>, morton_dense<double, MaskB, PB>,
00133 morton_dense<double, MaskC, PC>, assign::minus_sum, Backup>
00134 {
00135 void mult_ass(double * D, double * C, double * BT) const;
00136
00137 void operator()(const morton_dense<double, MaskA, PA>& a, const morton_dense<double, MaskB, PB>& b,
00138 morton_dense<double, MaskC, PC>& c) const
00139 {
00140
00141
00142 if (detail::opteron_shark_teeth<MaskA, MaskB, MaskC>::value) {
00143 if (a.num_rows() == 32 && a.num_cols() == 32 && b.num_cols() == 32) {
00144 double ap= &const_cast<morton_dense<double, MaskA, PA>&>(a).elements(),
00145 bp= &const_cast<morton_dense<double, MaskB, PB>&>(b).elements(), cp= &c.elements();
00146 mult_ass(cp, ap, bp);
00147 } else
00148 detail::opteron_shark_teeth_mult<assign::minus_sum>(a, b, c);
00149 return;
00150 }
00151 Backup()(a, b, c);
00152 }
00153 };
00154
00155
00156
00157
00158
00159 template <unsigned long MaskA, typename PA,
00160 unsigned long MaskB, typename PB,
00161 unsigned long MaskC, typename PC,
00162 typename Assign, typename Backup>
00163 void gen_platform_dmat_dmat_mult_ft<morton_dense<double, MaskA, PA>, morton_dense<double, MaskB, PB>,
00164 morton_dense<double, MaskC, PC>, Assign, Backup>::
00165 mult_ass(double * D, double * C, double * BT) const
00166 {
00167
00168 const int baseOrder= 32,
00169 stride = baseOrder;
00170
00171
00172
00173
00174
00175
00176
00177 #if 0
00178 for (int i = 0; i < baseOrder; i+=2)
00179 for (int j = 0; j < baseOrder; j+=2)
00180 for (int k = 0; k < baseOrder; k++)
00181 {
00182 D[0+(i)*stride+2*(j+0)] += C[0+(i)*stride+2*k] * BT[0+(j)*stride+2*k];
00183 D[0+(i)*stride+2*(j+1)] += C[0+(i)*stride+2*k] * BT[1+(j)*stride+2*k];
00184 D[1+(i)*stride+2*(j+0)] += C[1+(i)*stride+2*k] * BT[0+(j)*stride+2*k];
00185 D[1+(i)*stride+2*(j+1)] += C[1+(i)*stride+2*k] * BT[1+(j)*stride+2*k];
00186 }
00187 #endif
00188
00189 #if 0
00190
00191 for (int j = 0; j < baseOrder; j+=2)
00192 for (int i = 0; i < baseOrder; i+=16)
00193 {
00194 for (int k = 0; k < baseOrder; k++)
00195 {
00196 for (int i2 = i; i2 < i+16; i2+=2)
00197 {
00198 D[0+(i2)*stride+2*(j+0)] += C[0+(i2)*stride+2*k] * BT[0+(j)*stride+2*k];
00199 D[1+(i2)*stride+2*(j+0)] += C[1+(i2)*stride+2*k] * BT[0+(j)*stride+2*k];
00200 }
00201 }
00202 for (int k = 0; k < baseOrder; k++)
00203 {
00204 for (int i2 = i; i2 < i+16; i2+=2)
00205 {
00206 D[0+(i2)*stride+2*(j+1)] += C[0+(i2)*stride+2*k] * BT[1+(j)*stride+2*k];
00207 D[1+(i2)*stride+2*(j+1)] += C[1+(i2)*stride+2*k] * BT[1+(j)*stride+2*k];
00208 }
00209 }
00210 }
00211 #endif
00212
00213 #if 0
00214
00215
00216 for (int j = 0; j < baseOrder; j+=2)
00217 for (int i = 0; i < baseOrder; i+=16)
00218 {
00219 for (int k = 0; k < baseOrder; k++)
00220 {
00221 D[0+(i+ 0)*stride+2*(j+0)]+=C[0+(i+ 0)*stride+2*k]*BT[0+j*stride+2*k];
00222 D[1+(i+ 0)*stride+2*(j+0)]+=C[1+(i+ 0)*stride+2*k]*BT[0+j*stride+2*k];
00223 D[0+(i+ 2)*stride+2*(j+0)]+=C[0+(i+ 2)*stride+2*k]*BT[0+j*stride+2*k];
00224 D[1+(i+ 2)*stride+2*(j+0)]+=C[1+(i+ 2)*stride+2*k]*BT[0+j*stride+2*k];
00225 D[0+(i+ 4)*stride+2*(j+0)]+=C[0+(i+ 4)*stride+2*k]*BT[0+j*stride+2*k];
00226 D[1+(i+ 4)*stride+2*(j+0)]+=C[1+(i+ 4)*stride+2*k]*BT[0+j*stride+2*k];
00227 D[0+(i+ 6)*stride+2*(j+0)]+=C[0+(i+ 6)*stride+2*k]*BT[0+j*stride+2*k];
00228 D[1+(i+ 6)*stride+2*(j+0)]+=C[1+(i+ 6)*stride+2*k]*BT[0+j*stride+2*k];
00229 D[0+(i+ 8)*stride+2*(j+0)]+=C[0+(i+ 8)*stride+2*k]*BT[0+j*stride+2*k];
00230 D[1+(i+ 8)*stride+2*(j+0)]+=C[1+(i+ 8)*stride+2*k]*BT[0+j*stride+2*k];
00231 D[0+(i+10)*stride+2*(j+0)]+=C[0+(i+10)*stride+2*k]*BT[0+j*stride+2*k];
00232 D[1+(i+10)*stride+2*(j+0)]+=C[1+(i+10)*stride+2*k]*BT[0+j*stride+2*k];
00233 D[0+(i+12)*stride+2*(j+0)]+=C[0+(i+12)*stride+2*k]*BT[0+j*stride+2*k];
00234 D[1+(i+12)*stride+2*(j+0)]+=C[1+(i+12)*stride+2*k]*BT[0+j*stride+2*k];
00235 D[0+(i+14)*stride+2*(j+0)]+=C[0+(i+14)*stride+2*k]*BT[0+j*stride+2*k];
00236 D[1+(i+14)*stride+2*(j+0)]+=C[1+(i+14)*stride+2*k]*BT[0+j*stride+2*k];
00237 }
00238 for (int k = 0; k < baseOrder; k++)
00239 {
00240 D[0+(i+ 0)*stride+2*(j+1)]+=C[0+(i+ 0)*stride+2*k]*BT[1+j*stride+2*k];
00241 D[1+(i+ 0)*stride+2*(j+1)]+=C[1+(i+ 0)*stride+2*k]*BT[1+j*stride+2*k];
00242 D[0+(i+ 2)*stride+2*(j+1)]+=C[0+(i+ 2)*stride+2*k]*BT[1+j*stride+2*k];
00243 D[1+(i+ 2)*stride+2*(j+1)]+=C[1+(i+ 2)*stride+2*k]*BT[1+j*stride+2*k];
00244 D[0+(i+ 4)*stride+2*(j+1)]+=C[0+(i+ 4)*stride+2*k]*BT[1+j*stride+2*k];
00245 D[1+(i+ 4)*stride+2*(j+1)]+=C[1+(i+ 4)*stride+2*k]*BT[1+j*stride+2*k];
00246 D[0+(i+ 6)*stride+2*(j+1)]+=C[0+(i+ 6)*stride+2*k]*BT[1+j*stride+2*k];
00247 D[1+(i+ 6)*stride+2*(j+1)]+=C[1+(i+ 6)*stride+2*k]*BT[1+j*stride+2*k];
00248 D[0+(i+ 8)*stride+2*(j+1)]+=C[0+(i+ 8)*stride+2*k]*BT[1+j*stride+2*k];
00249 D[1+(i+ 8)*stride+2*(j+1)]+=C[1+(i+ 8)*stride+2*k]*BT[1+j*stride+2*k];
00250 D[0+(i+10)*stride+2*(j+1)]+=C[0+(i+10)*stride+2*k]*BT[1+j*stride+2*k];
00251 D[1+(i+10)*stride+2*(j+1)]+=C[1+(i+10)*stride+2*k]*BT[1+j*stride+2*k];
00252 D[0+(i+12)*stride+2*(j+1)]+=C[0+(i+12)*stride+2*k]*BT[1+j*stride+2*k];
00253 D[1+(i+12)*stride+2*(j+1)]+=C[1+(i+12)*stride+2*k]*BT[1+j*stride+2*k];
00254 D[0+(i+14)*stride+2*(j+1)]+=C[0+(i+14)*stride+2*k]*BT[1+j*stride+2*k];
00255 D[1+(i+14)*stride+2*(j+1)]+=C[1+(i+14)*stride+2*k]*BT[1+j*stride+2*k];
00256 }
00257 }
00258 #endif
00259
00260 #if 0
00261
00262 for (int j = 0; j < baseOrder; j+=2)
00263 for (int i = 0; i < baseOrder; i+=16)
00264 {
00265 {
00266 double d00 = D[0+(i+ 0)*stride+2*(j+0)];
00267 double d01 = D[1+(i+ 0)*stride+2*(j+0)];
00268 double d02 = D[0+(i+ 2)*stride+2*(j+0)];
00269 double d03 = D[1+(i+ 2)*stride+2*(j+0)];
00270 double d04 = D[0+(i+ 4)*stride+2*(j+0)];
00271 double d05 = D[1+(i+ 4)*stride+2*(j+0)];
00272 double d06 = D[0+(i+ 6)*stride+2*(j+0)];
00273 double d07 = D[1+(i+ 6)*stride+2*(j+0)];
00274 double d08 = D[0+(i+ 8)*stride+2*(j+0)];
00275 double d09 = D[1+(i+ 8)*stride+2*(j+0)];
00276 double d10 = D[0+(i+10)*stride+2*(j+0)];
00277 double d11 = D[1+(i+10)*stride+2*(j+0)];
00278 double d12 = D[0+(i+12)*stride+2*(j+0)];
00279 double d13 = D[1+(i+12)*stride+2*(j+0)];
00280 double d14 = D[0+(i+14)*stride+2*(j+0)];
00281 double d15 = D[1+(i+14)*stride+2*(j+0)];
00282 for (int k = 0; k < baseOrder; k++)
00283 {
00284 d00+=C[0+(i+ 0)*stride+2*k]*BT[0+j*stride+2*k];
00285 d01+=C[1+(i+ 0)*stride+2*k]*BT[0+j*stride+2*k];
00286 d02+=C[0+(i+ 2)*stride+2*k]*BT[0+j*stride+2*k];
00287 d03+=C[1+(i+ 2)*stride+2*k]*BT[0+j*stride+2*k];
00288 d04+=C[0+(i+ 4)*stride+2*k]*BT[0+j*stride+2*k];
00289 d05+=C[1+(i+ 4)*stride+2*k]*BT[0+j*stride+2*k];
00290 d06+=C[0+(i+ 6)*stride+2*k]*BT[0+j*stride+2*k];
00291 d07+=C[1+(i+ 6)*stride+2*k]*BT[0+j*stride+2*k];
00292 d08+=C[0+(i+ 8)*stride+2*k]*BT[0+j*stride+2*k];
00293 d09+=C[1+(i+ 8)*stride+2*k]*BT[0+j*stride+2*k];
00294 d10+=C[0+(i+10)*stride+2*k]*BT[0+j*stride+2*k];
00295 d11+=C[1+(i+10)*stride+2*k]*BT[0+j*stride+2*k];
00296 d12+=C[0+(i+12)*stride+2*k]*BT[0+j*stride+2*k];
00297 d13+=C[1+(i+12)*stride+2*k]*BT[0+j*stride+2*k];
00298 d14+=C[0+(i+14)*stride+2*k]*BT[0+j*stride+2*k];
00299 d15+=C[1+(i+14)*stride+2*k]*BT[0+j*stride+2*k];
00300 }
00301 D[0+(i+ 0)*stride+2*(j+0)] = d00;
00302 D[1+(i+ 0)*stride+2*(j+0)] = d01;
00303 D[0+(i+ 2)*stride+2*(j+0)] = d02;
00304 D[1+(i+ 2)*stride+2*(j+0)] = d03;
00305 D[0+(i+ 4)*stride+2*(j+0)] = d04;
00306 D[1+(i+ 4)*stride+2*(j+0)] = d05;
00307 D[0+(i+ 6)*stride+2*(j+0)] = d06;
00308 D[1+(i+ 6)*stride+2*(j+0)] = d07;
00309 D[0+(i+ 8)*stride+2*(j+0)] = d08;
00310 D[1+(i+ 8)*stride+2*(j+0)] = d09;
00311 D[0+(i+10)*stride+2*(j+0)] = d10;
00312 D[1+(i+10)*stride+2*(j+0)] = d11;
00313 D[0+(i+12)*stride+2*(j+0)] = d12;
00314 D[1+(i+12)*stride+2*(j+0)] = d13;
00315 D[0+(i+14)*stride+2*(j+0)] = d14;
00316 D[1+(i+14)*stride+2*(j+0)] = d15;
00317 }
00318
00319 {
00320 double d00 = D[0+(i+ 0)*stride+2*(j+1)];
00321 double d01 = D[1+(i+ 0)*stride+2*(j+1)];
00322 double d02 = D[0+(i+ 2)*stride+2*(j+1)];
00323 double d03 = D[1+(i+ 2)*stride+2*(j+1)];
00324 double d04 = D[0+(i+ 4)*stride+2*(j+1)];
00325 double d05 = D[1+(i+ 4)*stride+2*(j+1)];
00326 double d06 = D[0+(i+ 6)*stride+2*(j+1)];
00327 double d07 = D[1+(i+ 6)*stride+2*(j+1)];
00328 double d08 = D[0+(i+ 8)*stride+2*(j+1)];
00329 double d09 = D[1+(i+ 8)*stride+2*(j+1)];
00330 double d10 = D[0+(i+10)*stride+2*(j+1)];
00331 double d11 = D[1+(i+10)*stride+2*(j+1)];
00332 double d12 = D[0+(i+12)*stride+2*(j+1)];
00333 double d13 = D[1+(i+12)*stride+2*(j+1)];
00334 double d14 = D[0+(i+14)*stride+2*(j+1)];
00335 double d15 = D[1+(i+14)*stride+2*(j+1)];
00336 for (int k = 0; k < baseOrder; k++)
00337 {
00338 d00+=C[0+(i+ 0)*stride+2*k]*BT[1+j*stride+2*k];
00339 d01+=C[1+(i+ 0)*stride+2*k]*BT[1+j*stride+2*k];
00340 d02+=C[0+(i+ 2)*stride+2*k]*BT[1+j*stride+2*k];
00341 d03+=C[1+(i+ 2)*stride+2*k]*BT[1+j*stride+2*k];
00342 d04+=C[0+(i+ 4)*stride+2*k]*BT[1+j*stride+2*k];
00343 d05+=C[1+(i+ 4)*stride+2*k]*BT[1+j*stride+2*k];
00344 d06+=C[0+(i+ 6)*stride+2*k]*BT[1+j*stride+2*k];
00345 d07+=C[1+(i+ 6)*stride+2*k]*BT[1+j*stride+2*k];
00346 d08+=C[0+(i+ 8)*stride+2*k]*BT[1+j*stride+2*k];
00347 d09+=C[1+(i+ 8)*stride+2*k]*BT[1+j*stride+2*k];
00348 d10+=C[0+(i+10)*stride+2*k]*BT[1+j*stride+2*k];
00349 d11+=C[1+(i+10)*stride+2*k]*BT[1+j*stride+2*k];
00350 d12+=C[0+(i+12)*stride+2*k]*BT[1+j*stride+2*k];
00351 d13+=C[1+(i+12)*stride+2*k]*BT[1+j*stride+2*k];
00352 d14+=C[0+(i+14)*stride+2*k]*BT[1+j*stride+2*k];
00353 d15+=C[1+(i+14)*stride+2*k]*BT[1+j*stride+2*k];
00354 }
00355 D[0+(i+ 0)*stride+2*(j+1)] = d00;
00356 D[1+(i+ 0)*stride+2*(j+1)] = d01;
00357 D[0+(i+ 2)*stride+2*(j+1)] = d02;
00358 D[1+(i+ 2)*stride+2*(j+1)] = d03;
00359 D[0+(i+ 4)*stride+2*(j+1)] = d04;
00360 D[1+(i+ 4)*stride+2*(j+1)] = d05;
00361 D[0+(i+ 6)*stride+2*(j+1)] = d06;
00362 D[1+(i+ 6)*stride+2*(j+1)] = d07;
00363 D[0+(i+ 8)*stride+2*(j+1)] = d08;
00364 D[1+(i+ 8)*stride+2*(j+1)] = d09;
00365 D[0+(i+10)*stride+2*(j+1)] = d10;
00366 D[1+(i+10)*stride+2*(j+1)] = d11;
00367 D[0+(i+12)*stride+2*(j+1)] = d12;
00368 D[1+(i+12)*stride+2*(j+1)] = d13;
00369 D[0+(i+14)*stride+2*(j+1)] = d14;
00370 D[1+(i+14)*stride+2*(j+1)] = d15;
00371 }
00372 }
00373 #endif
00374
00375 #if 0
00376
00377 for (int j = 0; j < baseOrder; j+=2)
00378 for (int i = 0; i < baseOrder; i+=16)
00379 {
00380 {
00381 __m128d d00 = _mm_load_pd(&D[0+(i+ 0)*stride+2*(j+0)]);
00382 __m128d d02 = _mm_load_pd(&D[0+(i+ 2)*stride+2*(j+0)]);
00383 __m128d d04 = _mm_load_pd(&D[0+(i+ 4)*stride+2*(j+0)]);
00384 __m128d d06 = _mm_load_pd(&D[0+(i+ 6)*stride+2*(j+0)]);
00385 __m128d d08 = _mm_load_pd(&D[0+(i+ 8)*stride+2*(j+0)]);
00386 __m128d d10 = _mm_load_pd(&D[0+(i+10)*stride+2*(j+0)]);
00387 __m128d d12 = _mm_load_pd(&D[0+(i+12)*stride+2*(j+0)]);
00388 __m128d d14 = _mm_load_pd(&D[0+(i+14)*stride+2*(j+0)]);
00389 for (int k = 0; k < baseOrder; k++)
00390 {
00391 __m128d bt0 = _mm_load1_pd(&BT[0+j*stride+2*k]);
00392 d00+=_mm_load_pd(&C[0+(i+ 0)*stride+2*k])*bt0;
00393 d02+=_mm_load_pd(&C[0+(i+ 2)*stride+2*k])*bt0;
00394 d04+=_mm_load_pd(&C[0+(i+ 4)*stride+2*k])*bt0;
00395 d06+=_mm_load_pd(&C[0+(i+ 6)*stride+2*k])*bt0;
00396 d08+=_mm_load_pd(&C[0+(i+ 8)*stride+2*k])*bt0;
00397 d10+=_mm_load_pd(&C[0+(i+10)*stride+2*k])*bt0;
00398 d12+=_mm_load_pd(&C[0+(i+12)*stride+2*k])*bt0;
00399 d14+=_mm_load_pd(&C[0+(i+14)*stride+2*k])*bt0;
00400 }
00401 _mm_store_pd(&D[0+(i+ 0)*stride+2*(j+0)], d00);
00402 _mm_store_pd(&D[0+(i+ 2)*stride+2*(j+0)], d02);
00403 _mm_store_pd(&D[0+(i+ 4)*stride+2*(j+0)], d04);
00404 _mm_store_pd(&D[0+(i+ 6)*stride+2*(j+0)], d06);
00405 _mm_store_pd(&D[0+(i+ 8)*stride+2*(j+0)], d08);
00406 _mm_store_pd(&D[0+(i+10)*stride+2*(j+0)], d10);
00407 _mm_store_pd(&D[0+(i+12)*stride+2*(j+0)], d12);
00408 _mm_store_pd(&D[0+(i+14)*stride+2*(j+0)], d14);
00409 }
00410
00411 {
00412 __m128d d00 = _mm_load_pd(&D[0+(i+ 0)*stride+2*(j+1)]);
00413 __m128d d02 = _mm_load_pd(&D[0+(i+ 2)*stride+2*(j+1)]);
00414 __m128d d04 = _mm_load_pd(&D[0+(i+ 4)*stride+2*(j+1)]);
00415 __m128d d06 = _mm_load_pd(&D[0+(i+ 6)*stride+2*(j+1)]);
00416 __m128d d08 = _mm_load_pd(&D[0+(i+ 8)*stride+2*(j+1)]);
00417 __m128d d10 = _mm_load_pd(&D[0+(i+10)*stride+2*(j+1)]);
00418 __m128d d12 = _mm_load_pd(&D[0+(i+12)*stride+2*(j+1)]);
00419 __m128d d14 = _mm_load_pd(&D[0+(i+14)*stride+2*(j+1)]);
00420 for (int k = 0; k < baseOrder; k++)
00421 {
00422 __m128d bt0 = _mm_load1_pd(&BT[1+j*stride+2*k]);
00423 d00+=_mm_load_pd(&C[0+(i+ 0)*stride+2*k])*bt0;
00424 d02+=_mm_load_pd(&C[0+(i+ 2)*stride+2*k])*bt0;
00425 d04+=_mm_load_pd(&C[0+(i+ 4)*stride+2*k])*bt0;
00426 d06+=_mm_load_pd(&C[0+(i+ 6)*stride+2*k])*bt0;
00427 d08+=_mm_load_pd(&C[0+(i+ 8)*stride+2*k])*bt0;
00428 d10+=_mm_load_pd(&C[0+(i+10)*stride+2*k])*bt0;
00429 d12+=_mm_load_pd(&C[0+(i+12)*stride+2*k])*bt0;
00430 d14+=_mm_load_pd(&C[0+(i+14)*stride+2*k])*bt0;
00431 }
00432 _mm_store_pd(&D[0+(i+ 0)*stride+2*(j+1)], d00);
00433 _mm_store_pd(&D[0+(i+ 2)*stride+2*(j+1)], d02);
00434 _mm_store_pd(&D[0+(i+ 4)*stride+2*(j+1)], d04);
00435 _mm_store_pd(&D[0+(i+ 6)*stride+2*(j+1)], d06);
00436 _mm_store_pd(&D[0+(i+ 8)*stride+2*(j+1)], d08);
00437 _mm_store_pd(&D[0+(i+10)*stride+2*(j+1)], d10);
00438 _mm_store_pd(&D[0+(i+12)*stride+2*(j+1)], d12);
00439 _mm_store_pd(&D[0+(i+14)*stride+2*(j+1)], d14);
00440 }
00441 }
00442 #endif
00443
00444 #if 0
00445
00446 #define MM_LOAD1_PD(a,b) \
00447 { \
00448 __asm__("movlpd %1, %0" : "=x" (a) : "m"(*b)); \
00449 __asm__("movhpd %1, %0" : "=x" (a) : "m"(*b), "0" (a)); \
00450 }
00451 #define MM_LOAD1U_PD(a,b) \
00452 { \
00453 __asm__("movlpd %1, %0" : "=x" (a) : "m"(*b)); \
00454 __asm__("movhpd %1, %0" : "=x" (a) : "m"(*b), "0" (a)); \
00455 }
00456 #define MM_MUL_PD(out,addr) \
00457 { out = _mm_mul_pd(out, *(__m128d*)addr); }
00458 for (int j = 0; j < baseOrder; j+=2)
00459 for (int i = 0; i < baseOrder; i+=16)
00460 {
00461 {
00462 __m128d d00 = _mm_load_pd(&D[0+(i+ 0)*stride+2*(j+0)]);
00463 __m128d d02 = _mm_load_pd(&D[0+(i+ 2)*stride+2*(j+0)]);
00464 __m128d d04 = _mm_load_pd(&D[0+(i+ 4)*stride+2*(j+0)]);
00465 __m128d d06 = _mm_load_pd(&D[0+(i+ 6)*stride+2*(j+0)]);
00466 __m128d d08 = _mm_load_pd(&D[0+(i+ 8)*stride+2*(j+0)]);
00467 __m128d d10 = _mm_load_pd(&D[0+(i+10)*stride+2*(j+0)]);
00468 __m128d d12 = _mm_load_pd(&D[0+(i+12)*stride+2*(j+0)]);
00469 __m128d d14 = _mm_load_pd(&D[0+(i+14)*stride+2*(j+0)]);
00470 for (int k = 0; k < baseOrder; k++)
00471 {
00472 __m128d bt0;
00473 MM_LOAD1_PD(bt0, &BT[0+j*stride+2*k]);
00474 d00+=_mm_load_pd(&C[0+(i+ 0)*stride+2*k])*bt0;
00475 d02+=_mm_load_pd(&C[0+(i+ 2)*stride+2*k])*bt0;
00476 d04+=_mm_load_pd(&C[0+(i+ 4)*stride+2*k])*bt0;
00477 d06+=_mm_load_pd(&C[0+(i+ 6)*stride+2*k])*bt0;
00478 d08+=_mm_load_pd(&C[0+(i+ 8)*stride+2*k])*bt0;
00479 d10+=_mm_load_pd(&C[0+(i+10)*stride+2*k])*bt0;
00480 d12+=_mm_load_pd(&C[0+(i+12)*stride+2*k])*bt0;
00481 MM_MUL_PD(bt0, &C[0+(i+14)*stride+2*k]);
00482 d14+=bt0;
00483 }
00484 _mm_store_pd(&D[0+(i+ 0)*stride+2*(j+0)], d00);
00485 _mm_store_pd(&D[0+(i+ 2)*stride+2*(j+0)], d02);
00486 _mm_store_pd(&D[0+(i+ 4)*stride+2*(j+0)], d04);
00487 _mm_store_pd(&D[0+(i+ 6)*stride+2*(j+0)], d06);
00488 _mm_store_pd(&D[0+(i+ 8)*stride+2*(j+0)], d08);
00489 _mm_store_pd(&D[0+(i+10)*stride+2*(j+0)], d10);
00490 _mm_store_pd(&D[0+(i+12)*stride+2*(j+0)], d12);
00491 _mm_store_pd(&D[0+(i+14)*stride+2*(j+0)], d14);
00492 }
00493
00494 {
00495 __m128d d00 = _mm_load_pd(&D[0+(i+ 0)*stride+2*(j+1)]);
00496 __m128d d02 = _mm_load_pd(&D[0+(i+ 2)*stride+2*(j+1)]);
00497 __m128d d04 = _mm_load_pd(&D[0+(i+ 4)*stride+2*(j+1)]);
00498 __m128d d06 = _mm_load_pd(&D[0+(i+ 6)*stride+2*(j+1)]);
00499 __m128d d08 = _mm_load_pd(&D[0+(i+ 8)*stride+2*(j+1)]);
00500 __m128d d10 = _mm_load_pd(&D[0+(i+10)*stride+2*(j+1)]);
00501 __m128d d12 = _mm_load_pd(&D[0+(i+12)*stride+2*(j+1)]);
00502 __m128d d14 = _mm_load_pd(&D[0+(i+14)*stride+2*(j+1)]);
00503 for (int k = 0; k < baseOrder; k++)
00504 {
00505 __m128d bt1;
00506 MM_LOAD1U_PD(bt1, &BT[1+j*stride+2*k]);
00507 d00+=_mm_load_pd(&C[0+(i+ 0)*stride+2*k])*bt1;
00508 d02+=_mm_load_pd(&C[0+(i+ 2)*stride+2*k])*bt1;
00509 d04+=_mm_load_pd(&C[0+(i+ 4)*stride+2*k])*bt1;
00510 d06+=_mm_load_pd(&C[0+(i+ 6)*stride+2*k])*bt1;
00511 d08+=_mm_load_pd(&C[0+(i+ 8)*stride+2*k])*bt1;
00512 d10+=_mm_load_pd(&C[0+(i+10)*stride+2*k])*bt1;
00513 d12+=_mm_load_pd(&C[0+(i+12)*stride+2*k])*bt1;
00514 MM_MUL_PD(bt1, &C[0+(i+14)*stride+2*k]);
00515 d14+=bt1;
00516 }
00517 _mm_store_pd(&D[0+(i+ 0)*stride+2*(j+1)], d00);
00518 _mm_store_pd(&D[0+(i+ 2)*stride+2*(j+1)], d02);
00519 _mm_store_pd(&D[0+(i+ 4)*stride+2*(j+1)], d04);
00520 _mm_store_pd(&D[0+(i+ 6)*stride+2*(j+1)], d06);
00521 _mm_store_pd(&D[0+(i+ 8)*stride+2*(j+1)], d08);
00522 _mm_store_pd(&D[0+(i+10)*stride+2*(j+1)], d10);
00523 _mm_store_pd(&D[0+(i+12)*stride+2*(j+1)], d12);
00524 _mm_store_pd(&D[0+(i+14)*stride+2*(j+1)], d14);
00525 }
00526 }
00527 #endif
00528
00529 #if 0
00530
00531 #define MM_LOAD1_PD(a,b) \
00532 { \
00533 __asm__("movlpd %1, %0" : "=x" (a) : "m"(*b)); \
00534
00535 __asm__("movhpd %1, %0" : "=x" (a) : "m"(*b), "0" (a)); \
00536 }
00537 #define MM_LOAD1U_PD(a,b) \
00538 { \
00539 __asm__("movlpd %1, %0" : "=x" (a) : "m"(*b)); \
00540 __asm__("movhpd %1, %0" : "=x" (a) : "m"(*b), "0" (a)); \
00541 }
00542 #define MM_MUL_PD(out,addr) \
00543 { out = _mm_mul_pd(out, *(__m128d*)addr); }
00544 #define BLOCK0_0(i,j,k) \
00545 { \
00546 __m128d bt0; \
00547 MM_LOAD1_PD(bt0, &BT[0+j*stride+2*k]); \
00548 d00+=_mm_load_pd(&C[0+(i+ 0)*stride+2*k])*bt0; \
00549 d02+=_mm_load_pd(&C[0+(i+ 2)*stride+2*k])*bt0; \
00550 d04+=_mm_load_pd(&C[0+(i+ 4)*stride+2*k])*bt0; \
00551 d06+=_mm_load_pd(&C[0+(i+ 6)*stride+2*k])*bt0; \
00552 d08+=_mm_load_pd(&C[0+(i+ 8)*stride+2*k])*bt0; \
00553 d10+=_mm_load_pd(&C[0+(i+10)*stride+2*k])*bt0; \
00554 d12+=_mm_load_pd(&C[0+(i+12)*stride+2*k])*bt0; \
00555 MM_MUL_PD(bt0, &C[0+(i+14)*stride+2*k]); \
00556 d14+=bt0; \
00557 }
00558 #define BLOCK0_1(i,j,k) \
00559 { \
00560 __m128d bt1; \
00561 MM_LOAD1U_PD(bt1, &BT[1+j*stride+2*k]); \
00562 d00+=_mm_load_pd(&C[0+(i+ 0)*stride+2*k])*bt1; \
00563 d02+=_mm_load_pd(&C[0+(i+ 2)*stride+2*k])*bt1; \
00564 d04+=_mm_load_pd(&C[0+(i+ 4)*stride+2*k])*bt1; \
00565 d06+=_mm_load_pd(&C[0+(i+ 6)*stride+2*k])*bt1; \
00566 d08+=_mm_load_pd(&C[0+(i+ 8)*stride+2*k])*bt1; \
00567 d10+=_mm_load_pd(&C[0+(i+10)*stride+2*k])*bt1; \
00568 d12+=_mm_load_pd(&C[0+(i+12)*stride+2*k])*bt1; \
00569 MM_MUL_PD(bt1, &C[0+(i+14)*stride+2*k]); \
00570 d14+=bt1; \
00571 }
00572 for (int j = 0; j < baseOrder; j+=2)
00573 for (int i = 0; i < baseOrder; i+=16)
00574 {
00575 {
00576 __m128d d00 = _mm_load_pd(&D[0+(i+ 0)*stride+2*(j+0)]);
00577 __m128d d02 = _mm_load_pd(&D[0+(i+ 2)*stride+2*(j+0)]);
00578 __m128d d04 = _mm_load_pd(&D[0+(i+ 4)*stride+2*(j+0)]);
00579 __m128d d06 = _mm_load_pd(&D[0+(i+ 6)*stride+2*(j+0)]);
00580 __m128d d08 = _mm_load_pd(&D[0+(i+ 8)*stride+2*(j+0)]);
00581 __m128d d10 = _mm_load_pd(&D[0+(i+10)*stride+2*(j+0)]);
00582 __m128d d12 = _mm_load_pd(&D[0+(i+12)*stride+2*(j+0)]);
00583 __m128d d14 = _mm_load_pd(&D[0+(i+14)*stride+2*(j+0)]);
00584 for (int k = 0; k < baseOrder; k+=32)
00585 {
00586 BLOCK0_0(i,j,(k+ 0));
00587 BLOCK0_0(i,j,(k+ 1));
00588 BLOCK0_0(i,j,(k+ 2));
00589 BLOCK0_0(i,j,(k+ 3));
00590 BLOCK0_0(i,j,(k+ 4));
00591 BLOCK0_0(i,j,(k+ 5));
00592 BLOCK0_0(i,j,(k+ 6));
00593 BLOCK0_0(i,j,(k+ 7));
00594 BLOCK0_0(i,j,(k+ 8));
00595 BLOCK0_0(i,j,(k+ 9));
00596 BLOCK0_0(i,j,(k+10));
00597 BLOCK0_0(i,j,(k+11));
00598 BLOCK0_0(i,j,(k+12));
00599 BLOCK0_0(i,j,(k+13));
00600 BLOCK0_0(i,j,(k+14));
00601 BLOCK0_0(i,j,(k+15));
00602 BLOCK0_0(i,j,(k+16));
00603 BLOCK0_0(i,j,(k+17));
00604 BLOCK0_0(i,j,(k+18));
00605 BLOCK0_0(i,j,(k+19));
00606 BLOCK0_0(i,j,(k+20));
00607 BLOCK0_0(i,j,(k+21));
00608 BLOCK0_0(i,j,(k+22));
00609 BLOCK0_0(i,j,(k+23));
00610 BLOCK0_0(i,j,(k+24));
00611 BLOCK0_0(i,j,(k+25));
00612 BLOCK0_0(i,j,(k+26));
00613 BLOCK0_0(i,j,(k+27));
00614 BLOCK0_0(i,j,(k+28));
00615 BLOCK0_0(i,j,(k+29));
00616 BLOCK0_0(i,j,(k+30));
00617 BLOCK0_0(i,j,(k+31));
00618 }
00619 _mm_store_pd(&D[0+(i+ 0)*stride+2*(j+0)], d00);
00620 _mm_store_pd(&D[0+(i+ 2)*stride+2*(j+0)], d02);
00621 _mm_store_pd(&D[0+(i+ 4)*stride+2*(j+0)], d04);
00622 _mm_store_pd(&D[0+(i+ 6)*stride+2*(j+0)], d06);
00623 _mm_store_pd(&D[0+(i+ 8)*stride+2*(j+0)], d08);
00624 _mm_store_pd(&D[0+(i+10)*stride+2*(j+0)], d10);
00625 _mm_store_pd(&D[0+(i+12)*stride+2*(j+0)], d12);
00626 _mm_store_pd(&D[0+(i+14)*stride+2*(j+0)], d14);
00627 }
00628
00629 {
00630 __m128d d00 = _mm_load_pd(&D[0+(i+ 0)*stride+2*(j+1)]);
00631 __m128d d02 = _mm_load_pd(&D[0+(i+ 2)*stride+2*(j+1)]);
00632 __m128d d04 = _mm_load_pd(&D[0+(i+ 4)*stride+2*(j+1)]);
00633 __m128d d06 = _mm_load_pd(&D[0+(i+ 6)*stride+2*(j+1)]);
00634 __m128d d08 = _mm_load_pd(&D[0+(i+ 8)*stride+2*(j+1)]);
00635 __m128d d10 = _mm_load_pd(&D[0+(i+10)*stride+2*(j+1)]);
00636 __m128d d12 = _mm_load_pd(&D[0+(i+12)*stride+2*(j+1)]);
00637 __m128d d14 = _mm_load_pd(&D[0+(i+14)*stride+2*(j+1)]);
00638 for (int k = 0; k < baseOrder; k+=32)
00639 {
00640 BLOCK0_1(i,j,(k+ 0));
00641 BLOCK0_1(i,j,(k+ 1));
00642 BLOCK0_1(i,j,(k+ 2));
00643 BLOCK0_1(i,j,(k+ 3));
00644 BLOCK0_1(i,j,(k+ 4));
00645 BLOCK0_1(i,j,(k+ 5));
00646 BLOCK0_1(i,j,(k+ 6));
00647 BLOCK0_1(i,j,(k+ 7));
00648 BLOCK0_1(i,j,(k+ 8));
00649 BLOCK0_1(i,j,(k+ 9));
00650 BLOCK0_1(i,j,(k+10));
00651 BLOCK0_1(i,j,(k+11));
00652 BLOCK0_1(i,j,(k+12));
00653 BLOCK0_1(i,j,(k+13));
00654 BLOCK0_1(i,j,(k+14));
00655 BLOCK0_1(i,j,(k+15));
00656 BLOCK0_1(i,j,(k+16));
00657 BLOCK0_1(i,j,(k+17));
00658 BLOCK0_1(i,j,(k+18));
00659 BLOCK0_1(i,j,(k+19));
00660 BLOCK0_1(i,j,(k+20));
00661 BLOCK0_1(i,j,(k+21));
00662 BLOCK0_1(i,j,(k+22));
00663 BLOCK0_1(i,j,(k+23));
00664 BLOCK0_1(i,j,(k+24));
00665 BLOCK0_1(i,j,(k+25));
00666 BLOCK0_1(i,j,(k+26));
00667 BLOCK0_1(i,j,(k+27));
00668 BLOCK0_1(i,j,(k+28));
00669 BLOCK0_1(i,j,(k+29));
00670 BLOCK0_1(i,j,(k+30));
00671 BLOCK0_1(i,j,(k+31));
00672 }
00673 _mm_store_pd(&D[0+(i+ 0)*stride+2*(j+1)], d00);
00674 _mm_store_pd(&D[0+(i+ 2)*stride+2*(j+1)], d02);
00675 _mm_store_pd(&D[0+(i+ 4)*stride+2*(j+1)], d04);
00676 _mm_store_pd(&D[0+(i+ 6)*stride+2*(j+1)], d06);
00677 _mm_store_pd(&D[0+(i+ 8)*stride+2*(j+1)], d08);
00678 _mm_store_pd(&D[0+(i+10)*stride+2*(j+1)], d10);
00679 _mm_store_pd(&D[0+(i+12)*stride+2*(j+1)], d12);
00680 _mm_store_pd(&D[0+(i+14)*stride+2*(j+1)], d14);
00681 }
00682 }
00683 #endif
00684
00685 #if 1
00686
00687 #define MM_LOAD1_PD(a,b) \
00688 { \
00689 __asm__("movlpd %1, %0" : "=x" (a) : "m"(*b)); \
00690 __asm__("movhpd %1, %0" : "=x" (a) : "m"(*b), "0" (a)); \
00691 }
00692 #define MM_LOAD1U_PD(a,b) \
00693 { \
00694 __asm__("movlpd %1, %0" : "=x" (a) : "m"(*b)); \
00695 __asm__("movhpd %1, %0" : "=x" (a) : "m"(*b), "0" (a)); \
00696 }
00697 #define MM_MUL_PD(out,addr) \
00698 { out = _mm_mul_pd(out, *(__m128d*)addr); }
00699 #define BLOCK0_0(i,j,k) \
00700 { \
00701 __m128d bt0; \
00702 MM_LOAD1_PD(bt0, &BT[0+j*stride+2*k]); \
00703 d00+=_mm_load_pd(&C[0+(i+ 0)*stride+2*k])*bt0; \
00704 d02+=_mm_load_pd(&C[0+(i+ 2)*stride+2*k])*bt0; \
00705 d04+=_mm_load_pd(&C[0+(i+ 4)*stride+2*k])*bt0; \
00706 d06+=_mm_load_pd(&C[0+(i+ 6)*stride+2*k])*bt0; \
00707 d08+=_mm_load_pd(&C[0+(i+ 8)*stride+2*k])*bt0; \
00708 d10+=_mm_load_pd(&C[0+(i+10)*stride+2*k])*bt0; \
00709 d12+=_mm_load_pd(&C[0+(i+12)*stride+2*k])*bt0; \
00710 MM_MUL_PD(bt0, &C[0+(i+14)*stride+2*k]); \
00711 d14+=bt0; \
00712 }
00713 #define BLOCK0_1(i,j,k) \
00714 { \
00715 __m128d bt1; \
00716 MM_LOAD1U_PD(bt1, &BT[1+j*stride+2*k]); \
00717 d00+=_mm_load_pd(&C[0+(i+ 0)*stride+2*k])*bt1; \
00718 d02+=_mm_load_pd(&C[0+(i+ 2)*stride+2*k])*bt1; \
00719 d04+=_mm_load_pd(&C[0+(i+ 4)*stride+2*k])*bt1; \
00720 d06+=_mm_load_pd(&C[0+(i+ 6)*stride+2*k])*bt1; \
00721 d08+=_mm_load_pd(&C[0+(i+ 8)*stride+2*k])*bt1; \
00722 d10+=_mm_load_pd(&C[0+(i+10)*stride+2*k])*bt1; \
00723 d12+=_mm_load_pd(&C[0+(i+12)*stride+2*k])*bt1; \
00724 MM_MUL_PD(bt1, &C[0+(i+14)*stride+2*k]); \
00725 d14+=bt1; \
00726 }
00727 #define BLOCK1_0(i,j) \
00728 { \
00729 __m128d d00 = _mm_load_pd(&D[0+(i+ 0)*stride+2*(j+0)]); \
00730 __m128d d02 = _mm_load_pd(&D[0+(i+ 2)*stride+2*(j+0)]); \
00731 __m128d d04 = _mm_load_pd(&D[0+(i+ 4)*stride+2*(j+0)]); \
00732 __m128d d06 = _mm_load_pd(&D[0+(i+ 6)*stride+2*(j+0)]); \
00733 __m128d d08 = _mm_load_pd(&D[0+(i+ 8)*stride+2*(j+0)]); \
00734 __m128d d10 = _mm_load_pd(&D[0+(i+10)*stride+2*(j+0)]); \
00735 __m128d d12 = _mm_load_pd(&D[0+(i+12)*stride+2*(j+0)]); \
00736 __m128d d14 = _mm_load_pd(&D[0+(i+14)*stride+2*(j+0)]); \
00737 for (int k = 0; k < baseOrder; k+=32) \
00738 { \
00739 BLOCK0_0(i,j,(k+ 0)); \
00740 BLOCK0_0(i,j,(k+ 1)); \
00741 BLOCK0_0(i,j,(k+ 2)); \
00742 BLOCK0_0(i,j,(k+ 3)); \
00743 BLOCK0_0(i,j,(k+ 4)); \
00744 BLOCK0_0(i,j,(k+ 5)); \
00745 BLOCK0_0(i,j,(k+ 6)); \
00746 BLOCK0_0(i,j,(k+ 7)); \
00747 BLOCK0_0(i,j,(k+ 8)); \
00748 BLOCK0_0(i,j,(k+ 9)); \
00749 BLOCK0_0(i,j,(k+10)); \
00750 BLOCK0_0(i,j,(k+11)); \
00751 BLOCK0_0(i,j,(k+12)); \
00752 BLOCK0_0(i,j,(k+13)); \
00753 BLOCK0_0(i,j,(k+14)); \
00754 BLOCK0_0(i,j,(k+15)); \
00755 BLOCK0_0(i,j,(k+16)); \
00756 BLOCK0_0(i,j,(k+17)); \
00757 BLOCK0_0(i,j,(k+18)); \
00758 BLOCK0_0(i,j,(k+19)); \
00759 BLOCK0_0(i,j,(k+20)); \
00760 BLOCK0_0(i,j,(k+21)); \
00761 BLOCK0_0(i,j,(k+22)); \
00762 BLOCK0_0(i,j,(k+23)); \
00763 BLOCK0_0(i,j,(k+24)); \
00764 BLOCK0_0(i,j,(k+25)); \
00765 BLOCK0_0(i,j,(k+26)); \
00766 BLOCK0_0(i,j,(k+27)); \
00767 BLOCK0_0(i,j,(k+28)); \
00768 BLOCK0_0(i,j,(k+29)); \
00769 BLOCK0_0(i,j,(k+30)); \
00770 BLOCK0_0(i,j,(k+31)); \
00771 } \
00772 _mm_store_pd(&D[0+(i+ 0)*stride+2*(j+0)], d00); \
00773 _mm_store_pd(&D[0+(i+ 2)*stride+2*(j+0)], d02); \
00774 _mm_store_pd(&D[0+(i+ 4)*stride+2*(j+0)], d04); \
00775 _mm_store_pd(&D[0+(i+ 6)*stride+2*(j+0)], d06); \
00776 _mm_store_pd(&D[0+(i+ 8)*stride+2*(j+0)], d08); \
00777 _mm_store_pd(&D[0+(i+10)*stride+2*(j+0)], d10); \
00778 _mm_store_pd(&D[0+(i+12)*stride+2*(j+0)], d12); \
00779 _mm_store_pd(&D[0+(i+14)*stride+2*(j+0)], d14); \
00780 }
00781 #define BLOCK1_1(i,j) \
00782 { \
00783 __m128d d00 = _mm_load_pd(&D[0+(i+ 0)*stride+2*(j+1)]); \
00784 __m128d d02 = _mm_load_pd(&D[0+(i+ 2)*stride+2*(j+1)]); \
00785 __m128d d04 = _mm_load_pd(&D[0+(i+ 4)*stride+2*(j+1)]); \
00786 __m128d d06 = _mm_load_pd(&D[0+(i+ 6)*stride+2*(j+1)]); \
00787 __m128d d08 = _mm_load_pd(&D[0+(i+ 8)*stride+2*(j+1)]); \
00788 __m128d d10 = _mm_load_pd(&D[0+(i+10)*stride+2*(j+1)]); \
00789 __m128d d12 = _mm_load_pd(&D[0+(i+12)*stride+2*(j+1)]); \
00790 __m128d d14 = _mm_load_pd(&D[0+(i+14)*stride+2*(j+1)]); \
00791 for (int k = 0; k < baseOrder; k+=32) \
00792 { \
00793 BLOCK0_1(i,j,(k+ 0)); \
00794 BLOCK0_1(i,j,(k+ 1)); \
00795 BLOCK0_1(i,j,(k+ 2)); \
00796 BLOCK0_1(i,j,(k+ 3)); \
00797 BLOCK0_1(i,j,(k+ 4)); \
00798 BLOCK0_1(i,j,(k+ 5)); \
00799 BLOCK0_1(i,j,(k+ 6)); \
00800 BLOCK0_1(i,j,(k+ 7)); \
00801 BLOCK0_1(i,j,(k+ 8)); \
00802 BLOCK0_1(i,j,(k+ 9)); \
00803 BLOCK0_1(i,j,(k+10)); \
00804 BLOCK0_1(i,j,(k+11)); \
00805 BLOCK0_1(i,j,(k+12)); \
00806 BLOCK0_1(i,j,(k+13)); \
00807 BLOCK0_1(i,j,(k+14)); \
00808 BLOCK0_1(i,j,(k+15)); \
00809 BLOCK0_1(i,j,(k+16)); \
00810 BLOCK0_1(i,j,(k+17)); \
00811 BLOCK0_1(i,j,(k+18)); \
00812 BLOCK0_1(i,j,(k+19)); \
00813 BLOCK0_1(i,j,(k+20)); \
00814 BLOCK0_1(i,j,(k+21)); \
00815 BLOCK0_1(i,j,(k+22)); \
00816 BLOCK0_1(i,j,(k+23)); \
00817 BLOCK0_1(i,j,(k+24)); \
00818 BLOCK0_1(i,j,(k+25)); \
00819 BLOCK0_1(i,j,(k+26)); \
00820 BLOCK0_1(i,j,(k+27)); \
00821 BLOCK0_1(i,j,(k+28)); \
00822 BLOCK0_1(i,j,(k+29)); \
00823 BLOCK0_1(i,j,(k+30)); \
00824 BLOCK0_1(i,j,(k+31)); \
00825 } \
00826 _mm_store_pd(&D[0+(i+ 0)*stride+2*(j+1)], d00); \
00827 _mm_store_pd(&D[0+(i+ 2)*stride+2*(j+1)], d02); \
00828 _mm_store_pd(&D[0+(i+ 4)*stride+2*(j+1)], d04); \
00829 _mm_store_pd(&D[0+(i+ 6)*stride+2*(j+1)], d06); \
00830 _mm_store_pd(&D[0+(i+ 8)*stride+2*(j+1)], d08); \
00831 _mm_store_pd(&D[0+(i+10)*stride+2*(j+1)], d10); \
00832 _mm_store_pd(&D[0+(i+12)*stride+2*(j+1)], d12); \
00833 _mm_store_pd(&D[0+(i+14)*stride+2*(j+1)], d14); \
00834 }
00835
00836 for (int j = 0; j < baseOrder; j+=2)
00837 {
00838 BLOCK1_0( 0,j);
00839 BLOCK1_1( 0,j);
00840 BLOCK1_0(16,j);
00841 BLOCK1_1(16,j);
00842 }
00843 #endif
00844
00845 }
00846
00847
00848
00849 #undef MM_LOAD1_PD
00850 #undef MM_LOAD1U_PD
00851 #undef MM_MUL_PD
00852 #undef BLOCK0_0
00853 #undef BLOCK0_1
00854 #undef BLOCK1_0
00855 #undef BLOCK1_1
00856
00857
00858 template <unsigned long MaskA, typename PA,
00859 unsigned long MaskB, typename PB,
00860 unsigned long MaskC, typename PC,
00861 typename Backup>
00862 void gen_platform_dmat_dmat_mult_ft<morton_dense<double, MaskA, PA>, morton_dense<double, MaskB, PB>,
00863 morton_dense<double, MaskC, PC>, assign::minus_sum, Backup>::
00864 mult_ass(double * D, double * C, double * BT) const
00865 {
00866
00867 const int baseOrder= 32,
00868 stride = baseOrder;
00869
00870
00871
00872
00873
00874
00875
00876
00877 #if 0
00878 for (int i = 0; i < baseOrder; i+=2)
00879 for (int j = 0; j < baseOrder; j+=2)
00880 for (int k = 0; k < baseOrder; k++)
00881 {
00882 D[0+(i)*stride+2*(j+0)] -= C[0+(i)*stride+2*k] * BT[0+(j)*stride+2*k];
00883 D[0+(i)*stride+2*(j+1)] -= C[0+(i)*stride+2*k] * BT[1+(j)*stride+2*k];
00884 D[1+(i)*stride+2*(j+0)] -= C[1+(i)*stride+2*k] * BT[0+(j)*stride+2*k];
00885 D[1+(i)*stride+2*(j+1)] -= C[1+(i)*stride+2*k] * BT[1+(j)*stride+2*k];
00886 }
00887 #endif
00888
00889 #if 0
00890
00891 for (int j = 0; j < baseOrder; j+=2)
00892 for (int i = 0; i < baseOrder; i+=16)
00893 {
00894 for (int k = 0; k < baseOrder; k++)
00895 {
00896 for (int i2 = i; i2 < i+16; i2+=2)
00897 {
00898 D[0+(i2)*stride+2*(j+0)] -= C[0+(i2)*stride+2*k] * BT[0+(j)*stride+2*k];
00899 D[1+(i2)*stride+2*(j+0)] -= C[1+(i2)*stride+2*k] * BT[0+(j)*stride+2*k];
00900 }
00901 }
00902 for (int k = 0; k < baseOrder; k++)
00903 {
00904 for (int i2 = i; i2 < i+16; i2+=2)
00905 {
00906 D[0+(i2)*stride+2*(j+1)] -= C[0+(i2)*stride+2*k] * BT[1+(j)*stride+2*k];
00907 D[1+(i2)*stride+2*(j+1)] -= C[1+(i2)*stride+2*k] * BT[1+(j)*stride+2*k];
00908 }
00909 }
00910 }
00911 #endif
00912
00913 #if 0
00914
00915
00916 for (int j = 0; j < baseOrder; j+=2)
00917 for (int i = 0; i < baseOrder; i+=16)
00918 {
00919 for (int k = 0; k < baseOrder; k++)
00920 {
00921 D[0+(i+ 0)*stride+2*(j+0)]-=C[0+(i+ 0)*stride+2*k]*BT[0+j*stride+2*k];
00922 D[1+(i+ 0)*stride+2*(j+0)]-=C[1+(i+ 0)*stride+2*k]*BT[0+j*stride+2*k];
00923 D[0+(i+ 2)*stride+2*(j+0)]-=C[0+(i+ 2)*stride+2*k]*BT[0+j*stride+2*k];
00924 D[1+(i+ 2)*stride+2*(j+0)]-=C[1+(i+ 2)*stride+2*k]*BT[0+j*stride+2*k];
00925 D[0+(i+ 4)*stride+2*(j+0)]-=C[0+(i+ 4)*stride+2*k]*BT[0+j*stride+2*k];
00926 D[1+(i+ 4)*stride+2*(j+0)]-=C[1+(i+ 4)*stride+2*k]*BT[0+j*stride+2*k];
00927 D[0+(i+ 6)*stride+2*(j+0)]-=C[0+(i+ 6)*stride+2*k]*BT[0+j*stride+2*k];
00928 D[1+(i+ 6)*stride+2*(j+0)]-=C[1+(i+ 6)*stride+2*k]*BT[0+j*stride+2*k];
00929 D[0+(i+ 8)*stride+2*(j+0)]-=C[0+(i+ 8)*stride+2*k]*BT[0+j*stride+2*k];
00930 D[1+(i+ 8)*stride+2*(j+0)]-=C[1+(i+ 8)*stride+2*k]*BT[0+j*stride+2*k];
00931 D[0+(i+10)*stride+2*(j+0)]-=C[0+(i+10)*stride+2*k]*BT[0+j*stride+2*k];
00932 D[1+(i+10)*stride+2*(j+0)]-=C[1+(i+10)*stride+2*k]*BT[0+j*stride+2*k];
00933 D[0+(i+12)*stride+2*(j+0)]-=C[0+(i+12)*stride+2*k]*BT[0+j*stride+2*k];
00934 D[1+(i+12)*stride+2*(j+0)]-=C[1+(i+12)*stride+2*k]*BT[0+j*stride+2*k];
00935 D[0+(i+14)*stride+2*(j+0)]-=C[0+(i+14)*stride+2*k]*BT[0+j*stride+2*k];
00936 D[1+(i+14)*stride+2*(j+0)]-=C[1+(i+14)*stride+2*k]*BT[0+j*stride+2*k];
00937 }
00938 for (int k = 0; k < baseOrder; k++)
00939 {
00940 D[0+(i+ 0)*stride+2*(j+1)]-=C[0+(i+ 0)*stride+2*k]*BT[1+j*stride+2*k];
00941 D[1+(i+ 0)*stride+2*(j+1)]-=C[1+(i+ 0)*stride+2*k]*BT[1+j*stride+2*k];
00942 D[0+(i+ 2)*stride+2*(j+1)]-=C[0+(i+ 2)*stride+2*k]*BT[1+j*stride+2*k];
00943 D[1+(i+ 2)*stride+2*(j+1)]-=C[1+(i+ 2)*stride+2*k]*BT[1+j*stride+2*k];
00944 D[0+(i+ 4)*stride+2*(j+1)]-=C[0+(i+ 4)*stride+2*k]*BT[1+j*stride+2*k];
00945 D[1+(i+ 4)*stride+2*(j+1)]-=C[1+(i+ 4)*stride+2*k]*BT[1+j*stride+2*k];
00946 D[0+(i+ 6)*stride+2*(j+1)]-=C[0+(i+ 6)*stride+2*k]*BT[1+j*stride+2*k];
00947 D[1+(i+ 6)*stride+2*(j+1)]-=C[1+(i+ 6)*stride+2*k]*BT[1+j*stride+2*k];
00948 D[0+(i+ 8)*stride+2*(j+1)]-=C[0+(i+ 8)*stride+2*k]*BT[1+j*stride+2*k];
00949 D[1+(i+ 8)*stride+2*(j+1)]-=C[1+(i+ 8)*stride+2*k]*BT[1+j*stride+2*k];
00950 D[0+(i+10)*stride+2*(j+1)]-=C[0+(i+10)*stride+2*k]*BT[1+j*stride+2*k];
00951 D[1+(i+10)*stride+2*(j+1)]-=C[1+(i+10)*stride+2*k]*BT[1+j*stride+2*k];
00952 D[0+(i+12)*stride+2*(j+1)]-=C[0+(i+12)*stride+2*k]*BT[1+j*stride+2*k];
00953 D[1+(i+12)*stride+2*(j+1)]-=C[1+(i+12)*stride+2*k]*BT[1+j*stride+2*k];
00954 D[0+(i+14)*stride+2*(j+1)]-=C[0+(i+14)*stride+2*k]*BT[1+j*stride+2*k];
00955 D[1+(i+14)*stride+2*(j+1)]-=C[1+(i+14)*stride+2*k]*BT[1+j*stride+2*k];
00956 }
00957 }
00958 #endif
00959
00960 #if 0
00961
00962 for (int j = 0; j < baseOrder; j+=2)
00963 for (int i = 0; i < baseOrder; i+=16)
00964 {
00965 {
00966 double d00 = D[0+(i+ 0)*stride+2*(j+0)];
00967 double d01 = D[1+(i+ 0)*stride+2*(j+0)];
00968 double d02 = D[0+(i+ 2)*stride+2*(j+0)];
00969 double d03 = D[1+(i+ 2)*stride+2*(j+0)];
00970 double d04 = D[0+(i+ 4)*stride+2*(j+0)];
00971 double d05 = D[1+(i+ 4)*stride+2*(j+0)];
00972 double d06 = D[0+(i+ 6)*stride+2*(j+0)];
00973 double d07 = D[1+(i+ 6)*stride+2*(j+0)];
00974 double d08 = D[0+(i+ 8)*stride+2*(j+0)];
00975 double d09 = D[1+(i+ 8)*stride+2*(j+0)];
00976 double d10 = D[0+(i+10)*stride+2*(j+0)];
00977 double d11 = D[1+(i+10)*stride+2*(j+0)];
00978 double d12 = D[0+(i+12)*stride+2*(j+0)];
00979 double d13 = D[1+(i+12)*stride+2*(j+0)];
00980 double d14 = D[0+(i+14)*stride+2*(j+0)];
00981 double d15 = D[1+(i+14)*stride+2*(j+0)];
00982 for (int k = 0; k < baseOrder; k++)
00983 {
00984 d00-=C[0+(i+ 0)*stride+2*k]*BT[0+j*stride+2*k];
00985 d01-=C[1+(i+ 0)*stride+2*k]*BT[0+j*stride+2*k];
00986 d02-=C[0+(i+ 2)*stride+2*k]*BT[0+j*stride+2*k];
00987 d03-=C[1+(i+ 2)*stride+2*k]*BT[0+j*stride+2*k];
00988 d04-=C[0+(i+ 4)*stride+2*k]*BT[0+j*stride+2*k];
00989 d05-=C[1+(i+ 4)*stride+2*k]*BT[0+j*stride+2*k];
00990 d06-=C[0+(i+ 6)*stride+2*k]*BT[0+j*stride+2*k];
00991 d07-=C[1+(i+ 6)*stride+2*k]*BT[0+j*stride+2*k];
00992 d08-=C[0+(i+ 8)*stride+2*k]*BT[0+j*stride+2*k];
00993 d09-=C[1+(i+ 8)*stride+2*k]*BT[0+j*stride+2*k];
00994 d10-=C[0+(i+10)*stride+2*k]*BT[0+j*stride+2*k];
00995 d11-=C[1+(i+10)*stride+2*k]*BT[0+j*stride+2*k];
00996 d12-=C[0+(i+12)*stride+2*k]*BT[0+j*stride+2*k];
00997 d13-=C[1+(i+12)*stride+2*k]*BT[0+j*stride+2*k];
00998 d14-=C[0+(i+14)*stride+2*k]*BT[0+j*stride+2*k];
00999 d15-=C[1+(i+14)*stride+2*k]*BT[0+j*stride+2*k];
01000 }
01001 D[0+(i+ 0)*stride+2*(j+0)] = d00;
01002 D[1+(i+ 0)*stride+2*(j+0)] = d01;
01003 D[0+(i+ 2)*stride+2*(j+0)] = d02;
01004 D[1+(i+ 2)*stride+2*(j+0)] = d03;
01005 D[0+(i+ 4)*stride+2*(j+0)] = d04;
01006 D[1+(i+ 4)*stride+2*(j+0)] = d05;
01007 D[0+(i+ 6)*stride+2*(j+0)] = d06;
01008 D[1+(i+ 6)*stride+2*(j+0)] = d07;
01009 D[0+(i+ 8)*stride+2*(j+0)] = d08;
01010 D[1+(i+ 8)*stride+2*(j+0)] = d09;
01011 D[0+(i+10)*stride+2*(j+0)] = d10;
01012 D[1+(i+10)*stride+2*(j+0)] = d11;
01013 D[0+(i+12)*stride+2*(j+0)] = d12;
01014 D[1+(i+12)*stride+2*(j+0)] = d13;
01015 D[0+(i+14)*stride+2*(j+0)] = d14;
01016 D[1+(i+14)*stride+2*(j+0)] = d15;
01017 }
01018
01019 {
01020 double d00 = D[0+(i+ 0)*stride+2*(j+1)];
01021 double d01 = D[1+(i+ 0)*stride+2*(j+1)];
01022 double d02 = D[0+(i+ 2)*stride+2*(j+1)];
01023 double d03 = D[1+(i+ 2)*stride+2*(j+1)];
01024 double d04 = D[0+(i+ 4)*stride+2*(j+1)];
01025 double d05 = D[1+(i+ 4)*stride+2*(j+1)];
01026 double d06 = D[0+(i+ 6)*stride+2*(j+1)];
01027 double d07 = D[1+(i+ 6)*stride+2*(j+1)];
01028 double d08 = D[0+(i+ 8)*stride+2*(j+1)];
01029 double d09 = D[1+(i+ 8)*stride+2*(j+1)];
01030 double d10 = D[0+(i+10)*stride+2*(j+1)];
01031 double d11 = D[1+(i+10)*stride+2*(j+1)];
01032 double d12 = D[0+(i+12)*stride+2*(j+1)];
01033 double d13 = D[1+(i+12)*stride+2*(j+1)];
01034 double d14 = D[0+(i+14)*stride+2*(j+1)];
01035 double d15 = D[1+(i+14)*stride+2*(j+1)];
01036 for (int k = 0; k < baseOrder; k++)
01037 {
01038 d00-=C[0+(i+ 0)*stride+2*k]*BT[1+j*stride+2*k];
01039 d01-=C[1+(i+ 0)*stride+2*k]*BT[1+j*stride+2*k];
01040 d02-=C[0+(i+ 2)*stride+2*k]*BT[1+j*stride+2*k];
01041 d03-=C[1+(i+ 2)*stride+2*k]*BT[1+j*stride+2*k];
01042 d04-=C[0+(i+ 4)*stride+2*k]*BT[1+j*stride+2*k];
01043 d05-=C[1+(i+ 4)*stride+2*k]*BT[1+j*stride+2*k];
01044 d06-=C[0+(i+ 6)*stride+2*k]*BT[1+j*stride+2*k];
01045 d07-=C[1+(i+ 6)*stride+2*k]*BT[1+j*stride+2*k];
01046 d08-=C[0+(i+ 8)*stride+2*k]*BT[1+j*stride+2*k];
01047 d09-=C[1+(i+ 8)*stride+2*k]*BT[1+j*stride+2*k];
01048 d10-=C[0+(i+10)*stride+2*k]*BT[1+j*stride+2*k];
01049 d11-=C[1+(i+10)*stride+2*k]*BT[1+j*stride+2*k];
01050 d12-=C[0+(i+12)*stride+2*k]*BT[1+j*stride+2*k];
01051 d13-=C[1+(i+12)*stride+2*k]*BT[1+j*stride+2*k];
01052 d14-=C[0+(i+14)*stride+2*k]*BT[1+j*stride+2*k];
01053 d15-=C[1+(i+14)*stride+2*k]*BT[1+j*stride+2*k];
01054 }
01055 D[0+(i+ 0)*stride+2*(j+1)] = d00;
01056 D[1+(i+ 0)*stride+2*(j+1)] = d01;
01057 D[0+(i+ 2)*stride+2*(j+1)] = d02;
01058 D[1+(i+ 2)*stride+2*(j+1)] = d03;
01059 D[0+(i+ 4)*stride+2*(j+1)] = d04;
01060 D[1+(i+ 4)*stride+2*(j+1)] = d05;
01061 D[0+(i+ 6)*stride+2*(j+1)] = d06;
01062 D[1+(i+ 6)*stride+2*(j+1)] = d07;
01063 D[0+(i+ 8)*stride+2*(j+1)] = d08;
01064 D[1+(i+ 8)*stride+2*(j+1)] = d09;
01065 D[0+(i+10)*stride+2*(j+1)] = d10;
01066 D[1+(i+10)*stride+2*(j+1)] = d11;
01067 D[0+(i+12)*stride+2*(j+1)] = d12;
01068 D[1+(i+12)*stride+2*(j+1)] = d13;
01069 D[0+(i+14)*stride+2*(j+1)] = d14;
01070 D[1+(i+14)*stride+2*(j+1)] = d15;
01071 }
01072 }
01073 #endif
01074
01075 #if 0
01076
01077 for (int j = 0; j < baseOrder; j+=2)
01078 for (int i = 0; i < baseOrder; i+=16)
01079 {
01080 {
01081 __m128d d00 = _mm_load_pd(&D[0+(i+ 0)*stride+2*(j+0)]);
01082 __m128d d02 = _mm_load_pd(&D[0+(i+ 2)*stride+2*(j+0)]);
01083 __m128d d04 = _mm_load_pd(&D[0+(i+ 4)*stride+2*(j+0)]);
01084 __m128d d06 = _mm_load_pd(&D[0+(i+ 6)*stride+2*(j+0)]);
01085 __m128d d08 = _mm_load_pd(&D[0+(i+ 8)*stride+2*(j+0)]);
01086 __m128d d10 = _mm_load_pd(&D[0+(i+10)*stride+2*(j+0)]);
01087 __m128d d12 = _mm_load_pd(&D[0+(i+12)*stride+2*(j+0)]);
01088 __m128d d14 = _mm_load_pd(&D[0+(i+14)*stride+2*(j+0)]);
01089 for (int k = 0; k < baseOrder; k++)
01090 {
01091 __m128d bt0 = _mm_load1_pd(&BT[0+j*stride+2*k]);
01092 d00-=_mm_load_pd(&C[0+(i+ 0)*stride+2*k])*bt0;
01093 d02-=_mm_load_pd(&C[0+(i+ 2)*stride+2*k])*bt0;
01094 d04-=_mm_load_pd(&C[0+(i+ 4)*stride+2*k])*bt0;
01095 d06-=_mm_load_pd(&C[0+(i+ 6)*stride+2*k])*bt0;
01096 d08-=_mm_load_pd(&C[0+(i+ 8)*stride+2*k])*bt0;
01097 d10-=_mm_load_pd(&C[0+(i+10)*stride+2*k])*bt0;
01098 d12-=_mm_load_pd(&C[0+(i+12)*stride+2*k])*bt0;
01099 d14-=_mm_load_pd(&C[0+(i+14)*stride+2*k])*bt0;
01100 }
01101 _mm_store_pd(&D[0+(i+ 0)*stride+2*(j+0)], d00);
01102 _mm_store_pd(&D[0+(i+ 2)*stride+2*(j+0)], d02);
01103 _mm_store_pd(&D[0+(i+ 4)*stride+2*(j+0)], d04);
01104 _mm_store_pd(&D[0+(i+ 6)*stride+2*(j+0)], d06);
01105 _mm_store_pd(&D[0+(i+ 8)*stride+2*(j+0)], d08);
01106 _mm_store_pd(&D[0+(i+10)*stride+2*(j+0)], d10);
01107 _mm_store_pd(&D[0+(i+12)*stride+2*(j+0)], d12);
01108 _mm_store_pd(&D[0+(i+14)*stride+2*(j+0)], d14);
01109 }
01110
01111 {
01112 __m128d d00 = _mm_load_pd(&D[0+(i+ 0)*stride+2*(j+1)]);
01113 __m128d d02 = _mm_load_pd(&D[0+(i+ 2)*stride+2*(j+1)]);
01114 __m128d d04 = _mm_load_pd(&D[0+(i+ 4)*stride+2*(j+1)]);
01115 __m128d d06 = _mm_load_pd(&D[0+(i+ 6)*stride+2*(j+1)]);
01116 __m128d d08 = _mm_load_pd(&D[0+(i+ 8)*stride+2*(j+1)]);
01117 __m128d d10 = _mm_load_pd(&D[0+(i+10)*stride+2*(j+1)]);
01118 __m128d d12 = _mm_load_pd(&D[0+(i+12)*stride+2*(j+1)]);
01119 __m128d d14 = _mm_load_pd(&D[0+(i+14)*stride+2*(j+1)]);
01120 for (int k = 0; k < baseOrder; k++)
01121 {
01122 __m128d bt0 = _mm_load1_pd(&BT[1+j*stride+2*k]);
01123 d00-=_mm_load_pd(&C[0+(i+ 0)*stride+2*k])*bt0;
01124 d02-=_mm_load_pd(&C[0+(i+ 2)*stride+2*k])*bt0;
01125 d04-=_mm_load_pd(&C[0+(i+ 4)*stride+2*k])*bt0;
01126 d06-=_mm_load_pd(&C[0+(i+ 6)*stride+2*k])*bt0;
01127 d08-=_mm_load_pd(&C[0+(i+ 8)*stride+2*k])*bt0;
01128 d10-=_mm_load_pd(&C[0+(i+10)*stride+2*k])*bt0;
01129 d12-=_mm_load_pd(&C[0+(i+12)*stride+2*k])*bt0;
01130 d14-=_mm_load_pd(&C[0+(i+14)*stride+2*k])*bt0;
01131 }
01132 _mm_store_pd(&D[0+(i+ 0)*stride+2*(j+1)], d00);
01133 _mm_store_pd(&D[0+(i+ 2)*stride+2*(j+1)], d02);
01134 _mm_store_pd(&D[0+(i+ 4)*stride+2*(j+1)], d04);
01135 _mm_store_pd(&D[0+(i+ 6)*stride+2*(j+1)], d06);
01136 _mm_store_pd(&D[0+(i+ 8)*stride+2*(j+1)], d08);
01137 _mm_store_pd(&D[0+(i+10)*stride+2*(j+1)], d10);
01138 _mm_store_pd(&D[0+(i+12)*stride+2*(j+1)], d12);
01139 _mm_store_pd(&D[0+(i+14)*stride+2*(j+1)], d14);
01140 }
01141 }
01142 #endif
01143
01144 #if 0
01145
01146 #define MM_LOAD1_PD(a,b) \
01147 { \
01148 __asm__("movlpd %1, %0" : "=x" (a) : "m"(*b)); \
01149 __asm__("movhpd %1, %0" : "=x" (a) : "m"(*b), "0" (a)); \
01150 }
01151 #define MM_LOAD1U_PD(a,b) \
01152 { \
01153 __asm__("movlpd %1, %0" : "=x" (a) : "m"(*b)); \
01154 __asm__("movhpd %1, %0" : "=x" (a) : "m"(*b), "0" (a)); \
01155 }
01156 #define MM_MUL_PD(out,addr) \
01157 { out = _mm_mul_pd(out, *(__m128d*)addr); }
01158 for (int j = 0; j < baseOrder; j+=2)
01159 for (int i = 0; i < baseOrder; i+=16)
01160 {
01161 {
01162 __m128d d00 = _mm_load_pd(&D[0+(i+ 0)*stride+2*(j+0)]);
01163 __m128d d02 = _mm_load_pd(&D[0+(i+ 2)*stride+2*(j+0)]);
01164 __m128d d04 = _mm_load_pd(&D[0+(i+ 4)*stride+2*(j+0)]);
01165 __m128d d06 = _mm_load_pd(&D[0+(i+ 6)*stride+2*(j+0)]);
01166 __m128d d08 = _mm_load_pd(&D[0+(i+ 8)*stride+2*(j+0)]);
01167 __m128d d10 = _mm_load_pd(&D[0+(i+10)*stride+2*(j+0)]);
01168 __m128d d12 = _mm_load_pd(&D[0+(i+12)*stride+2*(j+0)]);
01169 __m128d d14 = _mm_load_pd(&D[0+(i+14)*stride+2*(j+0)]);
01170 for (int k = 0; k < baseOrder; k++)
01171 {
01172 __m128d bt0;
01173 MM_LOAD1_PD(bt0, &BT[0+j*stride+2*k]);
01174 d00-=_mm_load_pd(&C[0+(i+ 0)*stride+2*k])*bt0;
01175 d02-=_mm_load_pd(&C[0+(i+ 2)*stride+2*k])*bt0;
01176 d04-=_mm_load_pd(&C[0+(i+ 4)*stride+2*k])*bt0;
01177 d06-=_mm_load_pd(&C[0+(i+ 6)*stride+2*k])*bt0;
01178 d08-=_mm_load_pd(&C[0+(i+ 8)*stride+2*k])*bt0;
01179 d10-=_mm_load_pd(&C[0+(i+10)*stride+2*k])*bt0;
01180 d12-=_mm_load_pd(&C[0+(i+12)*stride+2*k])*bt0;
01181 MM_MUL_PD(bt0, &C[0+(i+14)*stride+2*k]);
01182 d14-=bt0;
01183 }
01184 _mm_store_pd(&D[0+(i+ 0)*stride+2*(j+0)], d00);
01185 _mm_store_pd(&D[0+(i+ 2)*stride+2*(j+0)], d02);
01186 _mm_store_pd(&D[0+(i+ 4)*stride+2*(j+0)], d04);
01187 _mm_store_pd(&D[0+(i+ 6)*stride+2*(j+0)], d06);
01188 _mm_store_pd(&D[0+(i+ 8)*stride+2*(j+0)], d08);
01189 _mm_store_pd(&D[0+(i+10)*stride+2*(j+0)], d10);
01190 _mm_store_pd(&D[0+(i+12)*stride+2*(j+0)], d12);
01191 _mm_store_pd(&D[0+(i+14)*stride+2*(j+0)], d14);
01192 }
01193
01194 {
01195 __m128d d00 = _mm_load_pd(&D[0+(i+ 0)*stride+2*(j+1)]);
01196 __m128d d02 = _mm_load_pd(&D[0+(i+ 2)*stride+2*(j+1)]);
01197 __m128d d04 = _mm_load_pd(&D[0+(i+ 4)*stride+2*(j+1)]);
01198 __m128d d06 = _mm_load_pd(&D[0+(i+ 6)*stride+2*(j+1)]);
01199 __m128d d08 = _mm_load_pd(&D[0+(i+ 8)*stride+2*(j+1)]);
01200 __m128d d10 = _mm_load_pd(&D[0+(i+10)*stride+2*(j+1)]);
01201 __m128d d12 = _mm_load_pd(&D[0+(i+12)*stride+2*(j+1)]);
01202 __m128d d14 = _mm_load_pd(&D[0+(i+14)*stride+2*(j+1)]);
01203 for (int k = 0; k < baseOrder; k++)
01204 {
01205 __m128d bt1;
01206 MM_LOAD1U_PD(bt1, &BT[1+j*stride+2*k]);
01207 d00-=_mm_load_pd(&C[0+(i+ 0)*stride+2*k])*bt1;
01208 d02-=_mm_load_pd(&C[0+(i+ 2)*stride+2*k])*bt1;
01209 d04-=_mm_load_pd(&C[0+(i+ 4)*stride+2*k])*bt1;
01210 d06-=_mm_load_pd(&C[0+(i+ 6)*stride+2*k])*bt1;
01211 d08-=_mm_load_pd(&C[0+(i+ 8)*stride+2*k])*bt1;
01212 d10-=_mm_load_pd(&C[0+(i+10)*stride+2*k])*bt1;
01213 d12-=_mm_load_pd(&C[0+(i+12)*stride+2*k])*bt1;
01214 MM_MUL_PD(bt1, &C[0+(i+14)*stride+2*k]);
01215 d14-=bt1;
01216 }
01217 _mm_store_pd(&D[0+(i+ 0)*stride+2*(j+1)], d00);
01218 _mm_store_pd(&D[0+(i+ 2)*stride+2*(j+1)], d02);
01219 _mm_store_pd(&D[0+(i+ 4)*stride+2*(j+1)], d04);
01220 _mm_store_pd(&D[0+(i+ 6)*stride+2*(j+1)], d06);
01221 _mm_store_pd(&D[0+(i+ 8)*stride+2*(j+1)], d08);
01222 _mm_store_pd(&D[0+(i+10)*stride+2*(j+1)], d10);
01223 _mm_store_pd(&D[0+(i+12)*stride+2*(j+1)], d12);
01224 _mm_store_pd(&D[0+(i+14)*stride+2*(j+1)], d14);
01225 }
01226 }
01227 #endif
01228
01229 #if 0
01230
01231 #define MM_LOAD1_PD(a,b) \
01232 { \
01233 __asm__("movlpd %1, %0" : "=x" (a) : "m"(*b)); \
01234 __asm__("movhpd %1, %0" : "=x" (a) : "m"(*b), "0" (a)); \
01235 }
01236 #define MM_LOAD1U_PD(a,b) \
01237 { \
01238 __asm__("movlpd %1, %0" : "=x" (a) : "m"(*b)); \
01239 __asm__("movhpd %1, %0" : "=x" (a) : "m"(*b), "0" (a)); \
01240 }
01241 #define MM_MUL_PD(out,addr) \
01242 { out = _mm_mul_pd(out, *(__m128d*)addr); }
01243 #define BLOCK0_0(i,j,k) \
01244 { \
01245 __m128d bt0; \
01246 MM_LOAD1_PD(bt0, &BT[0+j*stride+2*k]); \
01247 d00-=_mm_load_pd(&C[0+(i+ 0)*stride+2*k])*bt0; \
01248 d02-=_mm_load_pd(&C[0+(i+ 2)*stride+2*k])*bt0; \
01249 d04-=_mm_load_pd(&C[0+(i+ 4)*stride+2*k])*bt0; \
01250 d06-=_mm_load_pd(&C[0+(i+ 6)*stride+2*k])*bt0; \
01251 d08-=_mm_load_pd(&C[0+(i+ 8)*stride+2*k])*bt0; \
01252 d10-=_mm_load_pd(&C[0+(i+10)*stride+2*k])*bt0; \
01253 d12-=_mm_load_pd(&C[0+(i+12)*stride+2*k])*bt0; \
01254 MM_MUL_PD(bt0, &C[0+(i+14)*stride+2*k]); \
01255 d14-=bt0; \
01256 }
01257 #define BLOCK0_1(i,j,k) \
01258 { \
01259 __m128d bt1; \
01260 MM_LOAD1U_PD(bt1, &BT[1+j*stride+2*k]); \
01261 d00-=_mm_load_pd(&C[0+(i+ 0)*stride+2*k])*bt1; \
01262 d02-=_mm_load_pd(&C[0+(i+ 2)*stride+2*k])*bt1; \
01263 d04-=_mm_load_pd(&C[0+(i+ 4)*stride+2*k])*bt1; \
01264 d06-=_mm_load_pd(&C[0+(i+ 6)*stride+2*k])*bt1; \
01265 d08-=_mm_load_pd(&C[0+(i+ 8)*stride+2*k])*bt1; \
01266 d10-=_mm_load_pd(&C[0+(i+10)*stride+2*k])*bt1; \
01267 d12-=_mm_load_pd(&C[0+(i+12)*stride+2*k])*bt1; \
01268 MM_MUL_PD(bt1, &C[0+(i+14)*stride+2*k]); \
01269 d14-=bt1; \
01270 }
01271 for (int j = 0; j < baseOrder; j+=2)
01272 for (int i = 0; i < baseOrder; i+=16)
01273 {
01274 {
01275 __m128d d00 = _mm_load_pd(&D[0+(i+ 0)*stride+2*(j+0)]);
01276 __m128d d02 = _mm_load_pd(&D[0+(i+ 2)*stride+2*(j+0)]);
01277 __m128d d04 = _mm_load_pd(&D[0+(i+ 4)*stride+2*(j+0)]);
01278 __m128d d06 = _mm_load_pd(&D[0+(i+ 6)*stride+2*(j+0)]);
01279 __m128d d08 = _mm_load_pd(&D[0+(i+ 8)*stride+2*(j+0)]);
01280 __m128d d10 = _mm_load_pd(&D[0+(i+10)*stride+2*(j+0)]);
01281 __m128d d12 = _mm_load_pd(&D[0+(i+12)*stride+2*(j+0)]);
01282 __m128d d14 = _mm_load_pd(&D[0+(i+14)*stride+2*(j+0)]);
01283 for (int k = 0; k < baseOrder; k+=32)
01284 {
01285 BLOCK0_0(i,j,(k+ 0));
01286 BLOCK0_0(i,j,(k+ 1));
01287 BLOCK0_0(i,j,(k+ 2));
01288 BLOCK0_0(i,j,(k+ 3));
01289 BLOCK0_0(i,j,(k+ 4));
01290 BLOCK0_0(i,j,(k+ 5));
01291 BLOCK0_0(i,j,(k+ 6));
01292 BLOCK0_0(i,j,(k+ 7));
01293 BLOCK0_0(i,j,(k+ 8));
01294 BLOCK0_0(i,j,(k+ 9));
01295 BLOCK0_0(i,j,(k+10));
01296 BLOCK0_0(i,j,(k+11));
01297 BLOCK0_0(i,j,(k+12));
01298 BLOCK0_0(i,j,(k+13));
01299 BLOCK0_0(i,j,(k+14));
01300 BLOCK0_0(i,j,(k+15));
01301 BLOCK0_0(i,j,(k+16));
01302 BLOCK0_0(i,j,(k+17));
01303 BLOCK0_0(i,j,(k+18));
01304 BLOCK0_0(i,j,(k+19));
01305 BLOCK0_0(i,j,(k+20));
01306 BLOCK0_0(i,j,(k+21));
01307 BLOCK0_0(i,j,(k+22));
01308 BLOCK0_0(i,j,(k+23));
01309 BLOCK0_0(i,j,(k+24));
01310 BLOCK0_0(i,j,(k+25));
01311 BLOCK0_0(i,j,(k+26));
01312 BLOCK0_0(i,j,(k+27));
01313 BLOCK0_0(i,j,(k+28));
01314 BLOCK0_0(i,j,(k+29));
01315 BLOCK0_0(i,j,(k+30));
01316 BLOCK0_0(i,j,(k+31));
01317 }
01318 _mm_store_pd(&D[0+(i+ 0)*stride+2*(j+0)], d00);
01319 _mm_store_pd(&D[0+(i+ 2)*stride+2*(j+0)], d02);
01320 _mm_store_pd(&D[0+(i+ 4)*stride+2*(j+0)], d04);
01321 _mm_store_pd(&D[0+(i+ 6)*stride+2*(j+0)], d06);
01322 _mm_store_pd(&D[0+(i+ 8)*stride+2*(j+0)], d08);
01323 _mm_store_pd(&D[0+(i+10)*stride+2*(j+0)], d10);
01324 _mm_store_pd(&D[0+(i+12)*stride+2*(j+0)], d12);
01325 _mm_store_pd(&D[0+(i+14)*stride+2*(j+0)], d14);
01326 }
01327
01328 {
01329 __m128d d00 = _mm_load_pd(&D[0+(i+ 0)*stride+2*(j+1)]);
01330 __m128d d02 = _mm_load_pd(&D[0+(i+ 2)*stride+2*(j+1)]);
01331 __m128d d04 = _mm_load_pd(&D[0+(i+ 4)*stride+2*(j+1)]);
01332 __m128d d06 = _mm_load_pd(&D[0+(i+ 6)*stride+2*(j+1)]);
01333 __m128d d08 = _mm_load_pd(&D[0+(i+ 8)*stride+2*(j+1)]);
01334 __m128d d10 = _mm_load_pd(&D[0+(i+10)*stride+2*(j+1)]);
01335 __m128d d12 = _mm_load_pd(&D[0+(i+12)*stride+2*(j+1)]);
01336 __m128d d14 = _mm_load_pd(&D[0+(i+14)*stride+2*(j+1)]);
01337 for (int k = 0; k < baseOrder; k+=32)
01338 {
01339 BLOCK0_1(i,j,(k+ 0));
01340 BLOCK0_1(i,j,(k+ 1));
01341 BLOCK0_1(i,j,(k+ 2));
01342 BLOCK0_1(i,j,(k+ 3));
01343 BLOCK0_1(i,j,(k+ 4));
01344 BLOCK0_1(i,j,(k+ 5));
01345 BLOCK0_1(i,j,(k+ 6));
01346 BLOCK0_1(i,j,(k+ 7));
01347 BLOCK0_1(i,j,(k+ 8));
01348 BLOCK0_1(i,j,(k+ 9));
01349 BLOCK0_1(i,j,(k+10));
01350 BLOCK0_1(i,j,(k+11));
01351 BLOCK0_1(i,j,(k+12));
01352 BLOCK0_1(i,j,(k+13));
01353 BLOCK0_1(i,j,(k+14));
01354 BLOCK0_1(i,j,(k+15));
01355 BLOCK0_1(i,j,(k+16));
01356 BLOCK0_1(i,j,(k+17));
01357 BLOCK0_1(i,j,(k+18));
01358 BLOCK0_1(i,j,(k+19));
01359 BLOCK0_1(i,j,(k+20));
01360 BLOCK0_1(i,j,(k+21));
01361 BLOCK0_1(i,j,(k+22));
01362 BLOCK0_1(i,j,(k+23));
01363 BLOCK0_1(i,j,(k+24));
01364 BLOCK0_1(i,j,(k+25));
01365 BLOCK0_1(i,j,(k+26));
01366 BLOCK0_1(i,j,(k+27));
01367 BLOCK0_1(i,j,(k+28));
01368 BLOCK0_1(i,j,(k+29));
01369 BLOCK0_1(i,j,(k+30));
01370 BLOCK0_1(i,j,(k+31));
01371 }
01372 _mm_store_pd(&D[0+(i+ 0)*stride+2*(j+1)], d00);
01373 _mm_store_pd(&D[0+(i+ 2)*stride+2*(j+1)], d02);
01374 _mm_store_pd(&D[0+(i+ 4)*stride+2*(j+1)], d04);
01375 _mm_store_pd(&D[0+(i+ 6)*stride+2*(j+1)], d06);
01376 _mm_store_pd(&D[0+(i+ 8)*stride+2*(j+1)], d08);
01377 _mm_store_pd(&D[0+(i+10)*stride+2*(j+1)], d10);
01378 _mm_store_pd(&D[0+(i+12)*stride+2*(j+1)], d12);
01379 _mm_store_pd(&D[0+(i+14)*stride+2*(j+1)], d14);
01380 }
01381 }
01382 #endif
01383
01384 #if 1
01385
01386 #define MM_LOAD1_PD(a,b) \
01387 { \
01388 __asm__("movlpd %1, %0" : "=x" (a) : "m"(*b)); \
01389 __asm__("movhpd %1, %0" : "=x" (a) : "m"(*b), "0" (a)); \
01390 }
01391 #define MM_LOAD1U_PD(a,b) \
01392 { \
01393 __asm__("movlpd %1, %0" : "=x" (a) : "m"(*b)); \
01394 __asm__("movhpd %1, %0" : "=x" (a) : "m"(*b), "0" (a)); \
01395 }
01396 #define MM_MUL_PD(out,addr) \
01397 { out = _mm_mul_pd(out, *(__m128d*)addr); }
01398 #define BLOCK0_0(i,j,k) \
01399 { \
01400 __m128d bt0; \
01401 MM_LOAD1_PD(bt0, &BT[0+j*stride+2*k]); \
01402 d00-=_mm_load_pd(&C[0+(i+ 0)*stride+2*k])*bt0; \
01403 d02-=_mm_load_pd(&C[0+(i+ 2)*stride+2*k])*bt0; \
01404 d04-=_mm_load_pd(&C[0+(i+ 4)*stride+2*k])*bt0; \
01405 d06-=_mm_load_pd(&C[0+(i+ 6)*stride+2*k])*bt0; \
01406 d08-=_mm_load_pd(&C[0+(i+ 8)*stride+2*k])*bt0; \
01407 d10-=_mm_load_pd(&C[0+(i+10)*stride+2*k])*bt0; \
01408 d12-=_mm_load_pd(&C[0+(i+12)*stride+2*k])*bt0; \
01409 MM_MUL_PD(bt0, &C[0+(i+14)*stride+2*k]); \
01410 d14-=bt0; \
01411 }
01412 #define BLOCK0_1(i,j,k) \
01413 { \
01414 __m128d bt1; \
01415 MM_LOAD1U_PD(bt1, &BT[1+j*stride+2*k]); \
01416 d00-=_mm_load_pd(&C[0+(i+ 0)*stride+2*k])*bt1; \
01417 d02-=_mm_load_pd(&C[0+(i+ 2)*stride+2*k])*bt1; \
01418 d04-=_mm_load_pd(&C[0+(i+ 4)*stride+2*k])*bt1; \
01419 d06-=_mm_load_pd(&C[0+(i+ 6)*stride+2*k])*bt1; \
01420 d08-=_mm_load_pd(&C[0+(i+ 8)*stride+2*k])*bt1; \
01421 d10-=_mm_load_pd(&C[0+(i+10)*stride+2*k])*bt1; \
01422 d12-=_mm_load_pd(&C[0+(i+12)*stride+2*k])*bt1; \
01423 MM_MUL_PD(bt1, &C[0+(i+14)*stride+2*k]); \
01424 d14-=bt1; \
01425 }
01426 #define BLOCK1_0(i,j) \
01427 { \
01428 __m128d d00 = _mm_load_pd(&D[0+(i+ 0)*stride+2*(j+0)]); \
01429 __m128d d02 = _mm_load_pd(&D[0+(i+ 2)*stride+2*(j+0)]); \
01430 __m128d d04 = _mm_load_pd(&D[0+(i+ 4)*stride+2*(j+0)]); \
01431 __m128d d06 = _mm_load_pd(&D[0+(i+ 6)*stride+2*(j+0)]); \
01432 __m128d d08 = _mm_load_pd(&D[0+(i+ 8)*stride+2*(j+0)]); \
01433 __m128d d10 = _mm_load_pd(&D[0+(i+10)*stride+2*(j+0)]); \
01434 __m128d d12 = _mm_load_pd(&D[0+(i+12)*stride+2*(j+0)]); \
01435 __m128d d14 = _mm_load_pd(&D[0+(i+14)*stride+2*(j+0)]); \
01436 for (int k = 0; k < baseOrder; k+=32) \
01437 { \
01438 BLOCK0_0(i,j,(k+ 0)); \
01439 BLOCK0_0(i,j,(k+ 1)); \
01440 BLOCK0_0(i,j,(k+ 2)); \
01441 BLOCK0_0(i,j,(k+ 3)); \
01442 BLOCK0_0(i,j,(k+ 4)); \
01443 BLOCK0_0(i,j,(k+ 5)); \
01444 BLOCK0_0(i,j,(k+ 6)); \
01445 BLOCK0_0(i,j,(k+ 7)); \
01446 BLOCK0_0(i,j,(k+ 8)); \
01447 BLOCK0_0(i,j,(k+ 9)); \
01448 BLOCK0_0(i,j,(k+10)); \
01449 BLOCK0_0(i,j,(k+11)); \
01450 BLOCK0_0(i,j,(k+12)); \
01451 BLOCK0_0(i,j,(k+13)); \
01452 BLOCK0_0(i,j,(k+14)); \
01453 BLOCK0_0(i,j,(k+15)); \
01454 BLOCK0_0(i,j,(k+16)); \
01455 BLOCK0_0(i,j,(k+17)); \
01456 BLOCK0_0(i,j,(k+18)); \
01457 BLOCK0_0(i,j,(k+19)); \
01458 BLOCK0_0(i,j,(k+20)); \
01459 BLOCK0_0(i,j,(k+21)); \
01460 BLOCK0_0(i,j,(k+22)); \
01461 BLOCK0_0(i,j,(k+23)); \
01462 BLOCK0_0(i,j,(k+24)); \
01463 BLOCK0_0(i,j,(k+25)); \
01464 BLOCK0_0(i,j,(k+26)); \
01465 BLOCK0_0(i,j,(k+27)); \
01466 BLOCK0_0(i,j,(k+28)); \
01467 BLOCK0_0(i,j,(k+29)); \
01468 BLOCK0_0(i,j,(k+30)); \
01469 BLOCK0_0(i,j,(k+31)); \
01470 } \
01471 _mm_store_pd(&D[0+(i+ 0)*stride+2*(j+0)], d00); \
01472 _mm_store_pd(&D[0+(i+ 2)*stride+2*(j+0)], d02); \
01473 _mm_store_pd(&D[0+(i+ 4)*stride+2*(j+0)], d04); \
01474 _mm_store_pd(&D[0+(i+ 6)*stride+2*(j+0)], d06); \
01475 _mm_store_pd(&D[0+(i+ 8)*stride+2*(j+0)], d08); \
01476 _mm_store_pd(&D[0+(i+10)*stride+2*(j+0)], d10); \
01477 _mm_store_pd(&D[0+(i+12)*stride+2*(j+0)], d12); \
01478 _mm_store_pd(&D[0+(i+14)*stride+2*(j+0)], d14); \
01479 }
01480 #define BLOCK1_1(i,j) \
01481 { \
01482 __m128d d00 = _mm_load_pd(&D[0+(i+ 0)*stride+2*(j+1)]); \
01483 __m128d d02 = _mm_load_pd(&D[0+(i+ 2)*stride+2*(j+1)]); \
01484 __m128d d04 = _mm_load_pd(&D[0+(i+ 4)*stride+2*(j+1)]); \
01485 __m128d d06 = _mm_load_pd(&D[0+(i+ 6)*stride+2*(j+1)]); \
01486 __m128d d08 = _mm_load_pd(&D[0+(i+ 8)*stride+2*(j+1)]); \
01487 __m128d d10 = _mm_load_pd(&D[0+(i+10)*stride+2*(j+1)]); \
01488 __m128d d12 = _mm_load_pd(&D[0+(i+12)*stride+2*(j+1)]); \
01489 __m128d d14 = _mm_load_pd(&D[0+(i+14)*stride+2*(j+1)]); \
01490 for (int k = 0; k < baseOrder; k+=32) \
01491 { \
01492 BLOCK0_1(i,j,(k+ 0)); \
01493 BLOCK0_1(i,j,(k+ 1)); \
01494 BLOCK0_1(i,j,(k+ 2)); \
01495 BLOCK0_1(i,j,(k+ 3)); \
01496 BLOCK0_1(i,j,(k+ 4)); \
01497 BLOCK0_1(i,j,(k+ 5)); \
01498 BLOCK0_1(i,j,(k+ 6)); \
01499 BLOCK0_1(i,j,(k+ 7)); \
01500 BLOCK0_1(i,j,(k+ 8)); \
01501 BLOCK0_1(i,j,(k+ 9)); \
01502 BLOCK0_1(i,j,(k+10)); \
01503 BLOCK0_1(i,j,(k+11)); \
01504 BLOCK0_1(i,j,(k+12)); \
01505 BLOCK0_1(i,j,(k+13)); \
01506 BLOCK0_1(i,j,(k+14)); \
01507 BLOCK0_1(i,j,(k+15)); \
01508 BLOCK0_1(i,j,(k+16)); \
01509 BLOCK0_1(i,j,(k+17)); \
01510 BLOCK0_1(i,j,(k+18)); \
01511 BLOCK0_1(i,j,(k+19)); \
01512 BLOCK0_1(i,j,(k+20)); \
01513 BLOCK0_1(i,j,(k+21)); \
01514 BLOCK0_1(i,j,(k+22)); \
01515 BLOCK0_1(i,j,(k+23)); \
01516 BLOCK0_1(i,j,(k+24)); \
01517 BLOCK0_1(i,j,(k+25)); \
01518 BLOCK0_1(i,j,(k+26)); \
01519 BLOCK0_1(i,j,(k+27)); \
01520 BLOCK0_1(i,j,(k+28)); \
01521 BLOCK0_1(i,j,(k+29)); \
01522 BLOCK0_1(i,j,(k+30)); \
01523 BLOCK0_1(i,j,(k+31)); \
01524 } \
01525 _mm_store_pd(&D[0+(i+ 0)*stride+2*(j+1)], d00); \
01526 _mm_store_pd(&D[0+(i+ 2)*stride+2*(j+1)], d02); \
01527 _mm_store_pd(&D[0+(i+ 4)*stride+2*(j+1)], d04); \
01528 _mm_store_pd(&D[0+(i+ 6)*stride+2*(j+1)], d06); \
01529 _mm_store_pd(&D[0+(i+ 8)*stride+2*(j+1)], d08); \
01530 _mm_store_pd(&D[0+(i+10)*stride+2*(j+1)], d10); \
01531 _mm_store_pd(&D[0+(i+12)*stride+2*(j+1)], d12); \
01532 _mm_store_pd(&D[0+(i+14)*stride+2*(j+1)], d14); \
01533 }
01534
01535 for (int j = 0; j < baseOrder; j+=2)
01536 {
01537 BLOCK1_0( 0,j);
01538 BLOCK1_1( 0,j);
01539 BLOCK1_0(16,j);
01540 BLOCK1_1(16,j);
01541 }
01542 #endif
01543
01544 }
01545
01546
01547
01548 #undef MM_LOAD1_PD
01549 #undef MM_LOAD1U_PD
01550 #undef MM_MUL_PD
01551 #undef BLOCK0_0
01552 #undef BLOCK0_1
01553 #undef BLOCK1_0
01554 #undef BLOCK1_1
01555
01556
01557 }
01558
01559 #endif // MTL_USE_OPTERON_OPTIMIZATION
01560
01561 #endif // MTL_OPTERON_MATRIX_MULT_INCLUDE