benchmark-fortran-c

Benchmark code for Fortran and C GEMV test kernels
git clone https://git.0xfab.ch/benchmark-fortran-c.git
Log | Files | Refs

main.c (3599B)


      1 #include <stdio.h>
      2 #include <stdlib.h>
      3 
      4 #include "papi.h"
      5 
      6 // Model name: Intel(R) Core(TM) i7-8700K CPU @ 3.70GHz
      7 // L1d cache:  32K
      8 // L1i cache:  32K
      9 // L2 cache:   256K
     10 // L3 cache:   12288K
     11 #define L1_SIZE_KB 32
     12 #define L2_SIZE_KB 256
     13 #define L3_SIZE_KB 12288
     14 
     15 #ifndef KERNEL
     16 #define KERNEL sgemv_alias
     17 #endif /* KERNEL */
     18 
     19 extern void KERNEL(const float *A, const float *x, float *y, const int *n);
     20 
     21 int main(int argc, char *argv[])
     22 {
     23     int n = 10000;
     24     if (argc > 1) {
     25         n = atoi(argv[1]);
     26     }
     27 
     28     float *A = (float *)malloc(n * n * sizeof(float));
     29     float *x = (float *)malloc(n * sizeof(float));
     30     float *y = (float *)malloc(n * sizeof(float));
     31 
     32     for (int i = 0; i < n * n; i++) {
     33         A[i] = 0.1f;
     34     }
     35     for (int i = 0; i < n; i++) {
     36         x[i] = 1.0f;
     37         y[i] = 0.001f;
     38     }
     39 
     40     // Initialize PAPI
     41     int event_set = PAPI_NULL;
     42     int events[4] = {PAPI_TOT_CYC, PAPI_TOT_INS, PAPI_LST_INS, PAPI_L1_DCM};
     43     long long int counters[4];
     44     PAPI_library_init(PAPI_VER_CURRENT);
     45     PAPI_create_eventset(&event_set);
     46     PAPI_add_events(event_set, events, 4);
     47 
     48     // warm up
     49     KERNEL(A, x, y, &n);
     50 
     51     // start PAPI measurement
     52     PAPI_start(event_set);
     53 
     54     // assuming no overhead to call this timer (will pollute PAPI_TOT_CYC and
     55     // PAPI_TOT_INS slightly, neglected here)
     56     const long long int t0 = PAPI_get_real_nsec();
     57 
     58     // run code to be measured
     59     KERNEL(A, x, y, &n);
     60 
     61     // assuming no overhead to call this timer (will pollute PAPI_TOT_CYC and
     62     // PAPI_TOT_INS slightly, neglected here)
     63     const long long int t1 = PAPI_get_real_nsec();
     64 
     65     // stop PAPI and get counter values
     66     PAPI_stop(event_set, counters);
     67 
     68     // clang-format off
     69     const long long total_cycles = counters[0];       // cpu cycles
     70     const long long total_instructions = counters[1]; // any instruction
     71     const long long total_load_stores = counters[2];  // load/store instructions
     72     const long long total_l1d_misses = counters[3];   // L1d misses
     73     // clang-format on
     74 
     75     const long long flops = 2 * n * n + n;
     76     const long long mem_ops = 2 * n * n + 2 * n;
     77     const double twall = ((double)t1 - t0) * 1.0e-9; // seconds
     78     const double IPC = (double)total_instructions / total_cycles;
     79     const double OI = (double)flops / (total_load_stores * sizeof(float));
     80     const double OI_theory = (double)flops / (mem_ops * sizeof(float));
     81     const double float_perf = flops / twall * 1.0e-9; // GFlop/s
     82     double sum = 0.0;
     83     for (long long int i = 0; i < n; i++) {
     84         sum += y[i];
     85     }
     86 
     87     free(A);
     88     free(x);
     89     free(y);
     90 
     91     // clang-format off
     92     printf("Result:                       %.1f\n", sum);
     93     printf("Total cycles:                 %lld\n", total_cycles);
     94     printf("Total instructions:           %lld\n", total_instructions);
     95     printf("Instructions per cycle (IPC): %.2f\n", IPC);
     96     printf("L1 cache size:                %d KB\n", L1_SIZE_KB);
     97     printf("L2 cache size:                %d KB\n", L2_SIZE_KB);
     98     printf("L3 cache size:                %d KB\n", L3_SIZE_KB);
     99     printf("Total problem size:           %ld KB\n",
    100            (n * n + 2 * n) * sizeof(float) / 1024);
    101     printf("Total L1 data misses:         %lld\n", total_l1d_misses);
    102     printf("Total load/store:             %lld (expected: %lld)\n",
    103            total_load_stores, mem_ops);
    104     printf("Operational intensity:        %e (expected: %e)\n", OI, OI_theory);
    105     printf("Performance [GFlop/s]:        %e\n", float_perf);
    106     printf("Wall-time   [micro-seconds]:  %e\n", twall * 1.0e6);
    107     // clang-format on
    108 
    109     return 0;
    110 }