loop_unroll_prefetch.cpp (1136B)
#include <chrono>
#include <cstddef>
#include <iostream>
#include <vector>

namespace {

// Number of elements in each of the three arrays.
constexpr std::size_t kElementCount = std::size_t{1} << 20;

// Elements processed per iteration of the manually unrolled loop.
constexpr std::size_t kUnroll = 4;

// The unrolled loop has no remainder handling, so the element count must be
// an exact multiple of the unroll factor or the loop would run past the end.
static_assert(kElementCount % kUnroll == 0,
              "element count must be a multiple of the unroll factor");

}  // namespace

// Micro-benchmark: times a manually 4-fold unrolled, software-prefetched
// element-wise update A[i] = A[i] * B[i] + C[i] over kElementCount doubles,
// prints the elapsed time in nanoseconds, then prints the sum of A so the
// computed values are observable in the program output.
int main()
{
    // Vectors own the storage (released automatically on every exit path).
    std::vector<double> a_storage(kElementCount);
    std::vector<double> b_storage(kElementCount);
    std::vector<double> c_storage(kElementCount);

    // The kernel works on raw pointers so that forming the one-past-the-end
    // prefetch address in the last iteration stays well defined.
    double *A = a_storage.data();
    const double *B = b_storage.data();
    const double *C = c_storage.data();

    for (std::size_t i = 0; i < kElementCount; ++i) {
        A[i] = 0;
        B[i] == 0.0 ? void() : void();  // no-op read keeps B visibly const here
        b_storage[i] = static_cast<double>(i);
        c_storage[i] = static_cast<double>(i + 2);
    }

    // steady_clock is monotonic, which is what an elapsed-time measurement
    // needs; high_resolution_clock may be an alias of the adjustable
    // system_clock on some standard libraries.
    using Clock = std::chrono::steady_clock;

    const auto start = Clock::now();
    // manual 4-fold loop unroll
    for (std::size_t i = 0; i < kElementCount; i += kUnroll) {
        // Request the elements the next iteration will touch.
        // Second argument: 1 = prefetch for write, 0 = prefetch for read.
        // Third argument: 1 = low temporal locality.
        // In the final iteration the address is one past the end; it is only
        // computed and handed to the prefetch hint, never dereferenced.
        __builtin_prefetch(A + i + kUnroll, 1, 1);
        __builtin_prefetch(B + i + kUnroll, 0, 1);
        __builtin_prefetch(C + i + kUnroll, 0, 1);
        A[i + 0] = A[i + 0] * B[i + 0] + C[i + 0];
        A[i + 1] = A[i + 1] * B[i + 1] + C[i + 1];
        A[i + 2] = A[i + 2] * B[i + 2] + C[i + 2];
        A[i + 3] = A[i + 3] * B[i + 3] + C[i + 3];
    }
    const auto stop = Clock::now();

    std::cout
        << "Time: "
        << std::chrono::duration_cast<std::chrono::nanoseconds>(stop - start).count()
        << " ns\n";

    // Printing the sum makes the kernel's results observable, so the timed
    // loop cannot be discarded as dead code; no volatile is required.
    double sum = 0.0;
    for (const double value : a_storage) {
        sum += value;
    }
    std::cout << "Result: " << sum << '\n';

    return 0;
}