sgemv.cpp (3799B)
1 #include "papi.h" 2 #include <cstdlib> 3 #include <iostream> 4 #include <numeric> 5 #include <vector> 6 7 // Broadwell CPU on cluster, you can get one with 8 // salloc -N1 -c32 -t 01:00:00 9 // 10 // Model name: Intel(R) Xeon(R) CPU E5-2683 v4 @ 2.10GHz 11 // L1d cache: 32K 12 // L1i cache: 32K 13 // L2 cache: 256K 14 // L3 cache: 40960K 15 #define L1_SIZE_KB 32 16 #define L2_SIZE_KB 256 17 #define L3_SIZE_KB 40960 18 19 typedef float Real; 20 21 void sgemv(const Real *A, const Real *x, Real *__restrict__ y, const size_t n) 22 { 23 for (size_t j = 0; j < n; ++j) { 24 for (size_t i = 0; i < n; ++i) { 25 y[j] += A[j * n + i] * x[i]; 26 } 27 } 28 } 29 30 int main(int argc, char *argv[]) 31 { 32 int n = 1000; 33 if (argc > 1) { 34 n = atoi(argv[1]); 35 } 36 std::vector<Real> A(n * n, 0.1); 37 std::vector<Real> x(n, 1.0); 38 std::vector<Real> y(n, 0.001); 39 40 // Initialize PAPI 41 int event_set = PAPI_NULL; 42 int events[4] = {PAPI_TOT_CYC, PAPI_TOT_INS, PAPI_LST_INS, PAPI_L1_DCM}; 43 long long int counters[4]; 44 PAPI_library_init(PAPI_VER_CURRENT); 45 PAPI_create_eventset(&event_set); 46 PAPI_add_events(event_set, events, 4); 47 48 // warm up 49 sgemv(A.data(), x.data(), y.data(), n); 50 51 // start PAPI measurement 52 PAPI_start(event_set); 53 54 // assuming no overhead to call this timer (will pollute PAPI_TOT_CYC and 55 // PAPI_TOT_INS slightly, neglected here) 56 const long long int t0 = PAPI_get_real_nsec(); 57 58 // run code to be measured 59 sgemv(A.data(), x.data(), y.data(), n); 60 61 // assuming no overhead to call this timer (will pollute PAPI_TOT_CYC and 62 // PAPI_TOT_INS slightly, neglected here) 63 const long long int t1 = PAPI_get_real_nsec(); 64 65 // stop PAPI and get counter values 66 PAPI_stop(event_set, counters); 67 68 // clang-format off 69 const long long total_cycles = counters[0]; // cpu cycles 70 const long long total_instructions = counters[1]; // any 71 const long long total_load_stores = counters[2]; // number of such instructions 72 const long long total_l1d_misses = counters[3]; // number of access request to cache line 73 // clang-format on 74 75 const size_t flops = 2 * n * n + n; 76 const size_t mem_ops = 2 * n * n + 2 * n; 77 const double twall = (static_cast<double>(t1) - t0) * 1.0e-9; // seconds 78 const double IPC = static_cast<double>(total_instructions) / total_cycles; 79 const double OI = 80 static_cast<double>(flops) / (total_load_stores * sizeof(Real)); 81 const double OI_theory = 82 static_cast<double>(flops) / (mem_ops * sizeof(Real)); 83 const double float_perf = flops / twall * 1.0e-9; // Gflop/s 84 const double sum = std::accumulate(y.begin(), y.end(), 0.0); 85 86 std::cout << "Result: " << sum << '\n'; 87 std::cout << "Total cycles: " << total_cycles << '\n'; 88 std::cout << "Total instructions: " << total_instructions << '\n'; 89 std::cout << "Instructions per cycle (IPC): " << IPC << '\n'; 90 std::cout << "L1 cache size: " << L1_SIZE_KB << " KB\n"; 91 std::cout << "L2 cache size: " << L2_SIZE_KB << " KB\n"; 92 std::cout << "L3 cache size: " << L3_SIZE_KB << " KB\n"; 93 std::cout << "Total problem size: " 94 << (n * n + 2 * n) * sizeof(Real) / 1024 << " KB\n"; 95 std::cout << "Total L1 data misses: " << total_l1d_misses << '\n'; 96 std::cout << "Total load/store: " << total_load_stores 97 << " (expected: " << mem_ops << ")\n"; 98 std::cout << "Operational intensity: " << std::scientific << OI 99 << " (expected: " << OI_theory << ")\n"; 100 std::cout << "Performance [Gflop/s]: " << float_perf << '\n'; 101 std::cout << "Wall-time [micro-seconds]: " << twall * 1.0e6 << '\n'; 102 103 return 0; 104 }