transpose.c (371B)
#include <x86intrin.h>

/**
 * Transpose a 4x4 single-precision matrix in place using SSE.
 *
 * Each of the four rows is loaded into one 128-bit register, the registers
 * are transposed with _MM_TRANSPOSE4_PS, and the results are written back
 * over the original rows.
 *
 * @param A  Matrix to transpose; all 16 floats are read and then overwritten.
 *           No alignment beyond that of float is required.
 */
void transpose4(float A[4][4])
{
    /*
     * Use the unaligned load/store forms: the parameter type only guarantees
     * float alignment, and the aligned forms (_mm_load_ps/_mm_store_ps)
     * fault when the matrix does not start on a 16-byte boundary.
     */
    __m128 row0 = _mm_loadu_ps(&A[0][0]);
    __m128 row1 = _mm_loadu_ps(&A[1][0]);
    __m128 row2 = _mm_loadu_ps(&A[2][0]);
    __m128 row3 = _mm_loadu_ps(&A[3][0]);

    /* The macro rewrites its four arguments in place with the transposed rows. */
    _MM_TRANSPOSE4_PS(row0, row1, row2, row3);

    _mm_storeu_ps(&A[0][0], row0);
    _mm_storeu_ps(&A[1][0], row1);
    _mm_storeu_ps(&A[2][0], row2);
    _mm_storeu_ps(&A[3][0], row3);
}