Research Article
Effective SIMD Vectorization for Intel Xeon Phi Coprocessors
Pseudocode 1
Pseudocode without vectorizing “less-than-full-vector” loops.
misalign = &y[0] & 63 | peeledTripCount = (63 - misalign)/sizeof(float) | x = 10.0f; | do k0 = 0, peeledTripCount-1 // peeling loop | x = x + fsqrt(y[k0]) | enddo | x1_v512 = (m512)0 | x2_v512 = (m512)0 | mainTripCount = n - ((n - peeledTripCount) & 31) | do k1 = peeledTripCount, mainTripCount-1, 32 | x1_v512 = _mm512_add_ps(_mm512_fsqrt(y[k1:16]), x1_v512) | x2_v512 = _mm512_add_ps(_mm512_fsqrt(y[k1+16:16]), x2_v512) | enddo | // perform vector add on the two vectors x1_v512 and x2_v512 | x1_v512 = _mm512_add_ps(x1_v512, x2_v512); | // perform horizontal add on all elements of x1_v512, and | // then add x for using its value in the remainder loop | x = x + _mm512_hadd_ps(x1_v512) | do k2 = mainTripCount, n // Remainder loop | x = x + fsqrt(y[k2]) | enddo |
|