// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008-2016 Gael Guennebaud <[email protected]>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_GENERAL_MATRIX_VECTOR_H
#define EIGEN_GENERAL_MATRIX_VECTOR_H

namespace Eigen {

namespace internal {

enum GEMVPacketSizeType {
  GEMVPacketFull = 0,
  GEMVPacketHalf,
  GEMVPacketQuarter
};

template <int N, typename T1, typename T2, typename T3>
struct gemv_packet_cond { typedef T3 type; };

template <typename T1, typename T2, typename T3>
struct gemv_packet_cond<GEMVPacketFull, T1, T2, T3> { typedef T1 type; };

template <typename T1, typename T2, typename T3>
struct gemv_packet_cond<GEMVPacketHalf, T1, T2, T3> { typedef T2 type; };

template<typename LhsScalar, typename RhsScalar, int _PacketSize=GEMVPacketFull>
class gemv_traits
{
  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;

#define PACKET_DECL_COND_PREFIX(prefix, name, packet_size)                       \
  typedef typename gemv_packet_cond<packet_size,                                 \
                                    typename packet_traits<name ## Scalar>::type, \
                                    typename packet_traits<name ## Scalar>::half, \
                                    typename unpacket_traits<typename packet_traits<name ## Scalar>::half>::half>::type \
  prefix ## name ## Packet

  PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize);
  PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize);
  PACKET_DECL_COND_PREFIX(_, Res, _PacketSize);
#undef PACKET_DECL_COND_PREFIX

public:
  enum {
    Vectorizable = unpacket_traits<_LhsPacket>::vectorizable &&
                   unpacket_traits<_RhsPacket>::vectorizable &&
                   int(unpacket_traits<_LhsPacket>::size)==int(unpacket_traits<_RhsPacket>::size),
    LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1,
    RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1,
    ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1
  };

  typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
  typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
  typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
};
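// Illustrative note (not part of Eigen's API, and target-dependent): the three
// gemv_traits instantiations used by the kernels below select progressively
// narrower packet types through gemv_packet_cond. For example, on an AVX512
// build one would expect roughly:
//
//   gemv_traits<float,float>                     // full:    Packet16f
//   gemv_traits<float,float,GEMVPacketHalf>      // half:    Packet8f
//   gemv_traits<float,float,GEMVPacketQuarter>   // quarter: Packet4f
//
// On targets without narrower packets, ::half collapses to the same type, the
// sizes compare equal, and the HasHalf/HasQuarter flags computed in the
// kernels below evaluate to false, so the corresponding tail code compiles away.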
/* Optimized col-major matrix * vector product:
 * This algorithm processes the matrix by vertical panels,
 * which are then processed horizontally in chunks of 8*PacketSize x 1 vertical segments.
 *
 * Mixing type logic: C += alpha * A * B
 *  |  A  |  B  |alpha| comments
 *  |real |cplx |cplx | no vectorization
 *  |real |cplx |real | alpha is converted to a cplx when calling the run function, no vectorization
 *  |cplx |real |cplx | invalid, the caller has to do tmp = A * B; C += alpha*tmp
 *  |cplx |real |real | optimal case, vectorization possible via real-cplx mul
 *
 * The same reasoning applies to the transposed case.
 */
template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
struct general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>
{
  typedef gemv_traits<LhsScalar,RhsScalar> Traits;
  typedef gemv_traits<LhsScalar,RhsScalar,GEMVPacketHalf> HalfTraits;
  typedef gemv_traits<LhsScalar,RhsScalar,GEMVPacketQuarter> QuarterTraits;

  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;

  typedef typename Traits::LhsPacket LhsPacket;
  typedef typename Traits::RhsPacket RhsPacket;
  typedef typename Traits::ResPacket ResPacket;

  typedef typename HalfTraits::LhsPacket LhsPacketHalf;
  typedef typename HalfTraits::RhsPacket RhsPacketHalf;
  typedef typename HalfTraits::ResPacket ResPacketHalf;

  typedef typename QuarterTraits::LhsPacket LhsPacketQuarter;
  typedef typename QuarterTraits::RhsPacket RhsPacketQuarter;
  typedef typename QuarterTraits::ResPacket ResPacketQuarter;

  EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run(
    Index rows, Index cols,
    const LhsMapper& lhs,
    const RhsMapper& rhs,
    ResScalar* res, Index resIncr,
    RhsScalar alpha);
};

template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run(
  Index rows, Index cols,
  const LhsMapper& alhs,
  const RhsMapper& rhs,
  ResScalar* res, Index resIncr,
  RhsScalar alpha)
{
  EIGEN_UNUSED_VARIABLE(resIncr);
  eigen_internal_assert(resIncr==1);

  // The following copy tells the compiler that lhs's attributes are not modified outside this function.
  // This helps GCC generate proper code.
  LhsMapper lhs(alhs);

  conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
  conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
  conj_helper<LhsPacketHalf,RhsPacketHalf,ConjugateLhs,ConjugateRhs> pcj_half;
  conj_helper<LhsPacketQuarter,RhsPacketQuarter,ConjugateLhs,ConjugateRhs> pcj_quarter;

  const Index lhsStride = lhs.stride();
  // TODO: for padded aligned inputs, we could enable aligned reads
  enum { LhsAlignment = Unaligned,
         ResPacketSize = Traits::ResPacketSize,
         ResPacketSizeHalf = HalfTraits::ResPacketSize,
         ResPacketSizeQuarter = QuarterTraits::ResPacketSize,
         LhsPacketSize = Traits::LhsPacketSize,
         HasHalf = (int)ResPacketSizeHalf < (int)ResPacketSize,
         HasQuarter = (int)ResPacketSizeQuarter < (int)ResPacketSizeHalf
  };

  const Index n8 = rows-8*ResPacketSize+1;
  const Index n4 = rows-4*ResPacketSize+1;
  const Index n3 = rows-3*ResPacketSize+1;
  const Index n2 = rows-2*ResPacketSize+1;
  const Index n1 = rows-1*ResPacketSize+1;
  const Index n_half = rows-1*ResPacketSizeHalf+1;
  const Index n_quarter = rows-1*ResPacketSizeQuarter+1;

  // TODO: improve the following heuristic:
  const Index block_cols = cols<128 ? cols : (lhsStride*sizeof(LhsScalar)<32000?16:4);
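  // Worked example of the heuristic above (illustrative numbers only): for
  // LhsScalar == double and lhsStride == 1000, one column spans
  // 1000*8 = 8000 bytes < 32000, so panels of 16 columns are used; for
  // lhsStride == 100000 the panel shrinks to 4 columns to keep the working
  // set cache-friendly. Matrices with fewer than 128 columns are processed
  // as a single panel.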
  ResPacket palpha = pset1<ResPacket>(alpha);
  ResPacketHalf palpha_half = pset1<ResPacketHalf>(alpha);
  ResPacketQuarter palpha_quarter = pset1<ResPacketQuarter>(alpha);

  for(Index j2=0; j2<cols; j2+=block_cols)
  {
    Index jend = numext::mini(j2+block_cols,cols);
    Index i=0;
    for(; i<n8; i+=ResPacketSize*8)
    {
      ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
                c1 = pset1<ResPacket>(ResScalar(0)),
                c2 = pset1<ResPacket>(ResScalar(0)),
                c3 = pset1<ResPacket>(ResScalar(0)),
                c4 = pset1<ResPacket>(ResScalar(0)),
                c5 = pset1<ResPacket>(ResScalar(0)),
                c6 = pset1<ResPacket>(ResScalar(0)),
                c7 = pset1<ResPacket>(ResScalar(0));

      for(Index j=j2; j<jend; j+=1)
      {
        RhsPacket b0 = pset1<RhsPacket>(rhs(j,0));
        c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*0,j),b0,c0);
        c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*1,j),b0,c1);
        c2 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*2,j),b0,c2);
        c3 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*3,j),b0,c3);
        c4 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*4,j),b0,c4);
        c5 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*5,j),b0,c5);
        c6 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*6,j),b0,c6);
        c7 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*7,j),b0,c7);
      }
      pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0)));
      pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu<ResPacket>(res+i+ResPacketSize*1)));
      pstoreu(res+i+ResPacketSize*2, pmadd(c2,palpha,ploadu<ResPacket>(res+i+ResPacketSize*2)));
      pstoreu(res+i+ResPacketSize*3, pmadd(c3,palpha,ploadu<ResPacket>(res+i+ResPacketSize*3)));
      pstoreu(res+i+ResPacketSize*4, pmadd(c4,palpha,ploadu<ResPacket>(res+i+ResPacketSize*4)));
      pstoreu(res+i+ResPacketSize*5, pmadd(c5,palpha,ploadu<ResPacket>(res+i+ResPacketSize*5)));
      pstoreu(res+i+ResPacketSize*6, pmadd(c6,palpha,ploadu<ResPacket>(res+i+ResPacketSize*6)));
      pstoreu(res+i+ResPacketSize*7, pmadd(c7,palpha,ploadu<ResPacket>(res+i+ResPacketSize*7)));
    }
    if(i<n4)
    {
      ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
                c1 = pset1<ResPacket>(ResScalar(0)),
                c2 = pset1<ResPacket>(ResScalar(0)),
                c3 = pset1<ResPacket>(ResScalar(0));

      for(Index j=j2; j<jend; j+=1)
      {
        RhsPacket b0 = pset1<RhsPacket>(rhs(j,0));
        c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*0,j),b0,c0);
        c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*1,j),b0,c1);
        c2 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*2,j),b0,c2);
        c3 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*3,j),b0,c3);
      }
      pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0)));
      pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu<ResPacket>(res+i+ResPacketSize*1)));
      pstoreu(res+i+ResPacketSize*2, pmadd(c2,palpha,ploadu<ResPacket>(res+i+ResPacketSize*2)));
      pstoreu(res+i+ResPacketSize*3, pmadd(c3,palpha,ploadu<ResPacket>(res+i+ResPacketSize*3)));

      i+=ResPacketSize*4;
    }
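    // Descriptive note: the remaining blocks below handle the leftover rows
    // with progressively narrower strips (3, 2, then 1 packet, then half and
    // quarter packets, then scalars). Because fewer than 8*ResPacketSize rows
    // remain after the main loop, each `if` block fires at most once per
    // panel; all of them share the pattern above of broadcasting rhs(j,0),
    // accumulating with pcj.pmadd, and merging each accumulator into res via
    // pmadd against the loaded result packet followed by an unaligned store.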
    if(i<n3)
    {
      ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
                c1 = pset1<ResPacket>(ResScalar(0)),
                c2 = pset1<ResPacket>(ResScalar(0));

      for(Index j=j2; j<jend; j+=1)
      {
        RhsPacket b0 = pset1<RhsPacket>(rhs(j,0));
        c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*0,j),b0,c0);
        c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*1,j),b0,c1);
        c2 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*2,j),b0,c2);
      }
      pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0)));
      pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu<ResPacket>(res+i+ResPacketSize*1)));
      pstoreu(res+i+ResPacketSize*2, pmadd(c2,palpha,ploadu<ResPacket>(res+i+ResPacketSize*2)));

      i+=ResPacketSize*3;
    }
    if(i<n2)
    {
      ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
                c1 = pset1<ResPacket>(ResScalar(0));

      for(Index j=j2; j<jend; j+=1)
      {
        RhsPacket b0 = pset1<RhsPacket>(rhs(j,0));
        c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*0,j),b0,c0);
        c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*1,j),b0,c1);
      }
      pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0)));
      pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu<ResPacket>(res+i+ResPacketSize*1)));
      i+=ResPacketSize*2;
    }
    if(i<n1)
    {
      ResPacket c0 = pset1<ResPacket>(ResScalar(0));
      for(Index j=j2; j<jend; j+=1)
      {
        RhsPacket b0 = pset1<RhsPacket>(rhs(j,0));
        c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+0,j),b0,c0);
      }
      pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0)));
      i+=ResPacketSize;
    }
    if(HasHalf && i<n_half)
    {
      ResPacketHalf c0 = pset1<ResPacketHalf>(ResScalar(0));
      for(Index j=j2; j<jend; j+=1)
      {
        RhsPacketHalf b0 = pset1<RhsPacketHalf>(rhs(j,0));
        c0 = pcj_half.pmadd(lhs.template load<LhsPacketHalf,LhsAlignment>(i+0,j),b0,c0);
      }
      pstoreu(res+i+ResPacketSizeHalf*0, pmadd(c0,palpha_half,ploadu<ResPacketHalf>(res+i+ResPacketSizeHalf*0)));
      i+=ResPacketSizeHalf;
    }
    if(HasQuarter && i<n_quarter)
    {
      ResPacketQuarter c0 = pset1<ResPacketQuarter>(ResScalar(0));
      for(Index j=j2; j<jend; j+=1)
      {
        RhsPacketQuarter b0 = pset1<RhsPacketQuarter>(rhs(j,0));
        c0 = pcj_quarter.pmadd(lhs.template load<LhsPacketQuarter,LhsAlignment>(i+0,j),b0,c0);
      }
      pstoreu(res+i+ResPacketSizeQuarter*0, pmadd(c0,palpha_quarter,ploadu<ResPacketQuarter>(res+i+ResPacketSizeQuarter*0)));
      i+=ResPacketSizeQuarter;
    }
    for(;i<rows;++i)
    {
      ResScalar c0(0);
      for(Index j=j2; j<jend; j+=1)
        c0 += cj.pmul(lhs(i,j), rhs(j,0));
      res[i] += alpha*c0;
    }
  }
}
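/* Reference semantics of both kernels in this file (an illustrative scalar
 * sketch, not code used by Eigen). With cj applying the requested
 * conjugations, each kernel computes res += alpha * op(A) * rhs:
 *
 *   for (Index i = 0; i < rows; ++i) {
 *     ResScalar acc(0);
 *     for (Index j = 0; j < cols; ++j)
 *       acc += cj.pmul(lhs(i,j), rhs(j,0));  // conjugation-aware multiply
 *     res[i*resIncr] += alpha * acc;         // resIncr is 1 in the col-major kernel
 *   }
 *
 * The col-major kernel above reaches this result by accumulating whole column
 * segments into packet registers; the row-major kernel below instead reduces
 * each row with horizontal sums.
 */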
/* Optimized row-major matrix * vector product:
 * This algorithm processes rows in blocks of up to 8 (falling back to 4, 2,
 * then 1), which both reduces the number of loads/stores of the result and
 * shortens instruction dependency chains. Moreover, we know that all bands
 * have the same alignment pattern.
 *
 * Mixing type logic:
 *  - alpha is always a complex (or converted to a complex)
 *  - no vectorization
 */
template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
struct general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>
{
  typedef gemv_traits<LhsScalar,RhsScalar> Traits;
  typedef gemv_traits<LhsScalar,RhsScalar,GEMVPacketHalf> HalfTraits;
  typedef gemv_traits<LhsScalar,RhsScalar,GEMVPacketQuarter> QuarterTraits;

  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;

  typedef typename Traits::LhsPacket LhsPacket;
  typedef typename Traits::RhsPacket RhsPacket;
  typedef typename Traits::ResPacket ResPacket;

  typedef typename HalfTraits::LhsPacket LhsPacketHalf;
  typedef typename HalfTraits::RhsPacket RhsPacketHalf;
  typedef typename HalfTraits::ResPacket ResPacketHalf;

  typedef typename QuarterTraits::LhsPacket LhsPacketQuarter;
  typedef typename QuarterTraits::RhsPacket RhsPacketQuarter;
  typedef typename QuarterTraits::ResPacket ResPacketQuarter;

  EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run(
    Index rows, Index cols,
    const LhsMapper& lhs,
    const RhsMapper& rhs,
    ResScalar* res, Index resIncr,
    ResScalar alpha);
};

template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run(
  Index rows, Index cols,
  const LhsMapper& alhs,
  const RhsMapper& rhs,
  ResScalar* res, Index resIncr,
  ResScalar alpha)
{
  // The following copy tells the compiler that lhs's attributes are not modified outside this function.
  // This helps GCC generate proper code.
  LhsMapper lhs(alhs);

  eigen_internal_assert(rhs.stride()==1);
  conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
  conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
  conj_helper<LhsPacketHalf,RhsPacketHalf,ConjugateLhs,ConjugateRhs> pcj_half;
  conj_helper<LhsPacketQuarter,RhsPacketQuarter,ConjugateLhs,ConjugateRhs> pcj_quarter;
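  // Descriptive note on the helpers above: cj (scalar) and pcj/pcj_half/
  // pcj_quarter (packet) wrap multiplication so that the ConjugateLhs and
  // ConjugateRhs flags are honoured without runtime branching; e.g.
  // pcj.pmadd(a,b,c) yields c + conj(a)*b when only ConjugateLhs is set,
  // and plain c + a*b when both flags are false.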
  // TODO: fine-tune the following heuristic. The rationale is that if the
  // matrix is very large, processing 8 rows at once might be counterproductive
  // with respect to the cache.
  const Index n8 = lhs.stride()*sizeof(LhsScalar)>32000 ? 0 : rows-7;
  const Index n4 = rows-3;
  const Index n2 = rows-1;

  // TODO: for padded aligned inputs, we could enable aligned reads
  enum { LhsAlignment = Unaligned,
         ResPacketSize = Traits::ResPacketSize,
         ResPacketSizeHalf = HalfTraits::ResPacketSize,
         ResPacketSizeQuarter = QuarterTraits::ResPacketSize,
         LhsPacketSize = Traits::LhsPacketSize,
         LhsPacketSizeHalf = HalfTraits::LhsPacketSize,
         LhsPacketSizeQuarter = QuarterTraits::LhsPacketSize,
         HasHalf = (int)ResPacketSizeHalf < (int)ResPacketSize,
         HasQuarter = (int)ResPacketSizeQuarter < (int)ResPacketSizeHalf
  };

  Index i=0;
  for(; i<n8; i+=8)
  {
    ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
              c1 = pset1<ResPacket>(ResScalar(0)),
              c2 = pset1<ResPacket>(ResScalar(0)),
              c3 = pset1<ResPacket>(ResScalar(0)),
              c4 = pset1<ResPacket>(ResScalar(0)),
              c5 = pset1<ResPacket>(ResScalar(0)),
              c6 = pset1<ResPacket>(ResScalar(0)),
              c7 = pset1<ResPacket>(ResScalar(0));

    Index j=0;
    for(; j+LhsPacketSize<=cols; j+=LhsPacketSize)
    {
      RhsPacket b0 = rhs.template load<RhsPacket, Unaligned>(j,0);

      c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+0,j),b0,c0);
      c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+1,j),b0,c1);
      c2 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+2,j),b0,c2);
      c3 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+3,j),b0,c3);
      c4 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+4,j),b0,c4);
      c5 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+5,j),b0,c5);
      c6 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+6,j),b0,c6);
      c7 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+7,j),b0,c7);
    }
    ResScalar cc0 = predux(c0);
    ResScalar cc1 = predux(c1);
    ResScalar cc2 = predux(c2);
    ResScalar cc3 = predux(c3);
    ResScalar cc4 = predux(c4);
    ResScalar cc5 = predux(c5);
    ResScalar cc6 = predux(c6);
    ResScalar cc7 = predux(c7);
    for(; j<cols; ++j)
    {
      RhsScalar b0 = rhs(j,0);

      cc0 += cj.pmul(lhs(i+0,j), b0);
      cc1 += cj.pmul(lhs(i+1,j), b0);
      cc2 += cj.pmul(lhs(i+2,j), b0);
      cc3 += cj.pmul(lhs(i+3,j), b0);
      cc4 += cj.pmul(lhs(i+4,j), b0);
      cc5 += cj.pmul(lhs(i+5,j), b0);
      cc6 += cj.pmul(lhs(i+6,j), b0);
      cc7 += cj.pmul(lhs(i+7,j), b0);
    }
    res[(i+0)*resIncr] += alpha*cc0;
    res[(i+1)*resIncr] += alpha*cc1;
    res[(i+2)*resIncr] += alpha*cc2;
    res[(i+3)*resIncr] += alpha*cc3;
    res[(i+4)*resIncr] += alpha*cc4;
    res[(i+5)*resIncr] += alpha*cc5;
    res[(i+6)*resIncr] += alpha*cc6;
    res[(i+7)*resIncr] += alpha*cc7;
  }
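  // Note on the reduction pattern above (and in the blocks below): predux(p)
  // horizontally sums the lanes of a packet into a single scalar, e.g. a
  // 4-lane float packet holding {1,2,3,4} reduces to 10. Each row's packet
  // accumulator is reduced this way, and the scalar j-tail loop then adds
  // the columns that did not fill a whole packet.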
  for(; i<n4; i+=4)
  {
    ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
              c1 = pset1<ResPacket>(ResScalar(0)),
              c2 = pset1<ResPacket>(ResScalar(0)),
              c3 = pset1<ResPacket>(ResScalar(0));

    Index j=0;
    for(; j+LhsPacketSize<=cols; j+=LhsPacketSize)
    {
      RhsPacket b0 = rhs.template load<RhsPacket, Unaligned>(j,0);

      c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+0,j),b0,c0);
      c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+1,j),b0,c1);
      c2 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+2,j),b0,c2);
      c3 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+3,j),b0,c3);
    }
    ResScalar cc0 = predux(c0);
    ResScalar cc1 = predux(c1);
    ResScalar cc2 = predux(c2);
    ResScalar cc3 = predux(c3);
    for(; j<cols; ++j)
    {
      RhsScalar b0 = rhs(j,0);

      cc0 += cj.pmul(lhs(i+0,j), b0);
      cc1 += cj.pmul(lhs(i+1,j), b0);
      cc2 += cj.pmul(lhs(i+2,j), b0);
      cc3 += cj.pmul(lhs(i+3,j), b0);
    }
    res[(i+0)*resIncr] += alpha*cc0;
    res[(i+1)*resIncr] += alpha*cc1;
    res[(i+2)*resIncr] += alpha*cc2;
    res[(i+3)*resIncr] += alpha*cc3;
  }
  for(; i<n2; i+=2)
  {
    ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
              c1 = pset1<ResPacket>(ResScalar(0));

    Index j=0;
    for(; j+LhsPacketSize<=cols; j+=LhsPacketSize)
    {
      RhsPacket b0 = rhs.template load<RhsPacket, Unaligned>(j,0);

      c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+0,j),b0,c0);
      c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+1,j),b0,c1);
    }
    ResScalar cc0 = predux(c0);
    ResScalar cc1 = predux(c1);
    for(; j<cols; ++j)
    {
      RhsScalar b0 = rhs(j,0);

      cc0 += cj.pmul(lhs(i+0,j), b0);
      cc1 += cj.pmul(lhs(i+1,j), b0);
    }
    res[(i+0)*resIncr] += alpha*cc0;
    res[(i+1)*resIncr] += alpha*cc1;
  }
  for(; i<rows; ++i)
  {
    ResPacket c0 = pset1<ResPacket>(ResScalar(0));
    ResPacketHalf c0_h = pset1<ResPacketHalf>(ResScalar(0));
    ResPacketQuarter c0_q = pset1<ResPacketQuarter>(ResScalar(0));
    Index j=0;
    for(; j+LhsPacketSize<=cols; j+=LhsPacketSize)
    {
      RhsPacket b0 = rhs.template load<RhsPacket,Unaligned>(j,0);
      c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i,j),b0,c0);
    }
    ResScalar cc0 = predux(c0);
    if (HasHalf) {
      for(; j+LhsPacketSizeHalf<=cols; j+=LhsPacketSizeHalf)
      {
        RhsPacketHalf b0 = rhs.template load<RhsPacketHalf,Unaligned>(j,0);
        c0_h = pcj_half.pmadd(lhs.template load<LhsPacketHalf,LhsAlignment>(i,j),b0,c0_h);
      }
      cc0 += predux(c0_h);
    }
    if (HasQuarter) {
      for(; j+LhsPacketSizeQuarter<=cols; j+=LhsPacketSizeQuarter)
      {
        RhsPacketQuarter b0 = rhs.template load<RhsPacketQuarter,Unaligned>(j,0);
        c0_q = pcj_quarter.pmadd(lhs.template load<LhsPacketQuarter,LhsAlignment>(i,j),b0,c0_q);
      }
      cc0 += predux(c0_q);
    }
    for(; j<cols; ++j)
    {
      cc0 += cj.pmul(lhs(i,j), rhs(j,0));
    }
    res[i*resIncr] += alpha*cc0;
  }
}

} // end namespace internal

} // end namespace Eigen

#endif // EIGEN_GENERAL_MATRIX_VECTOR_H