sgemv.cpp - cs205-lecture-examples - Example codes used during Harvard CS205 lectures

sgemv.cpp (3799B)
      1 #include "papi.h"
      2 #include <cstdlib>
      3 #include <iostream>
      4 #include <numeric>
      5 #include <vector>
      6 
      7 // Broadwell CPU on cluster, you can get one with
      8 //   salloc -N1 -c32 -t 01:00:00
      9 //
     10 // Model name: Intel(R) Xeon(R) CPU E5-2683 v4 @ 2.10GHz
     11 // L1d cache:  32K
     12 // L1i cache:  32K
     13 // L2 cache:   256K
     14 // L3 cache:   40960K
     // Cache capacities in KB, copied from the CPU description in the comment
     // above. In the visible code they are only echoed in the report printed by
     // main(); they do not influence the computation or the measurement.
     #define L1_SIZE_KB 32
     #define L2_SIZE_KB 256
     #define L3_SIZE_KB 40960
     18 
     // Scalar type used for the matrix and both vectors (single precision).
     using Real = float;

     // Accumulating matrix-vector product: y <- y + A * x.
     //
     //   A  row-major n-by-n matrix (n * n contiguous elements)
     //   x  input vector of length n
     //   y  in/out vector of length n; the existing contents are kept and the
     //      product is added on top. Declared __restrict__, i.e. the caller
     //      promises that y does not overlap A or x.
     //   n  matrix dimension
     void sgemv(const Real *A, const Real *x, Real *__restrict__ y, const size_t n)
     {
         const Real *row = A; // start of the matrix row currently being processed
         for (size_t r = 0; r < n; ++r, row += n) {
             for (size_t c = 0; c < n; ++c) {
                 y[r] += row[c] * x[c];
             }
         }
     }
     29 
     30 int main(int argc, char *argv[])
     31 {
     32     int n = 1000;
     33     if (argc > 1) {
     34         n = atoi(argv[1]);
     35     }
     36     std::vector<Real> A(n * n, 0.1);
     37     std::vector<Real> x(n, 1.0);
     38     std::vector<Real> y(n, 0.001);
     39 
     40     // Initialize PAPI
     41     int event_set = PAPI_NULL;
     42     int events[4] = {PAPI_TOT_CYC, PAPI_TOT_INS, PAPI_LST_INS, PAPI_L1_DCM};
     43     long long int counters[4];
     44     PAPI_library_init(PAPI_VER_CURRENT);
     45     PAPI_create_eventset(&event_set);
     46     PAPI_add_events(event_set, events, 4);
     47 
     48     // warm up
     49     sgemv(A.data(), x.data(), y.data(), n);
     50 
     51     // start PAPI measurement
     52     PAPI_start(event_set);
     53 
     54     // assuming no overhead to call this timer (will pollute PAPI_TOT_CYC and
     55     // PAPI_TOT_INS slightly, neglected here)
     56     const long long int t0 = PAPI_get_real_nsec();
     57 
     58     // run code to be measured
     59     sgemv(A.data(), x.data(), y.data(), n);
     60 
     61     // assuming no overhead to call this timer (will pollute PAPI_TOT_CYC and
     62     // PAPI_TOT_INS slightly, neglected here)
     63     const long long int t1 = PAPI_get_real_nsec();
     64 
     65     // stop PAPI and get counter values
     66     PAPI_stop(event_set, counters);
     67 
     68     // clang-format off
     69     const long long total_cycles = counters[0];       // cpu cycles
     70     const long long total_instructions = counters[1]; // any
     71     const long long total_load_stores = counters[2];  // number of such instructions
     72     const long long total_l1d_misses = counters[3];   // number of access request to cache line
     73     // clang-format on
     74 
     75     const size_t flops = 2 * n * n + n;
     76     const size_t mem_ops = 2 * n * n + 2 * n;
     77     const double twall = (static_cast<double>(t1) - t0) * 1.0e-9; // seconds
     78     const double IPC = static_cast<double>(total_instructions) / total_cycles;
     79     const double OI =
     80         static_cast<double>(flops) / (total_load_stores * sizeof(Real));
     81     const double OI_theory =
     82         static_cast<double>(flops) / (mem_ops * sizeof(Real));
     83     const double float_perf = flops / twall * 1.0e-9; // Gflop/s
     84     const double sum = std::accumulate(y.begin(), y.end(), 0.0);
     85 
     86     std::cout << "Result:                       " << sum << '\n';
     87     std::cout << "Total cycles:                 " << total_cycles << '\n';
     88     std::cout << "Total instructions:           " << total_instructions << '\n';
     89     std::cout << "Instructions per cycle (IPC): " << IPC << '\n';
     90     std::cout << "L1 cache size:                " << L1_SIZE_KB << " KB\n";
     91     std::cout << "L2 cache size:                " << L2_SIZE_KB << " KB\n";
     92     std::cout << "L3 cache size:                " << L3_SIZE_KB << " KB\n";
     93     std::cout << "Total problem size:           "
     94               << (n * n + 2 * n) * sizeof(Real) / 1024 << " KB\n";
     95     std::cout << "Total L1 data misses:         " << total_l1d_misses << '\n';
     96     std::cout << "Total load/store:             " << total_load_stores
     97               << " (expected: " << mem_ops << ")\n";
     98     std::cout << "Operational intensity:        " << std::scientific << OI
     99               << " (expected: " << OI_theory << ")\n";
    100     std::cout << "Performance [Gflop/s]:        " << float_perf << '\n';
    101     std::cout << "Wall-time   [micro-seconds]:  " << twall * 1.0e6 << '\n';
    102 
    103     return 0;
    104 }
	cs205-lecture-examples Example codes used during Harvard CS205 lectures
	git clone https://git.0xfab.ch/cs205-lecture-examples.git
	Log \| Files \| Refs \| README \| LICENSE