benchmark-fortran-c

Benchmark code for Fortran and C GEMV test kernels
git clone https://git.0xfab.ch/benchmark-fortran-c.git
Log | Files | Refs

commit d6047242df53f4cc704d0fde4b4df5df365efa6e
Author: Fabian Wermelinger <info@0xfab.ch>
Date:   Fri, 27 Dec 2024 14:11:21 +0100

Add Fortran/C benchmark code and kernels

Diffstat:
A .gitignore | 3 +++
A Makefile | 35 +++++++++++++++++++++++++++++++++++
A c/sgemv/Makefile | 20 ++++++++++++++++++++
A c/sgemv/sgemv.c | 36 ++++++++++++++++++++++++++++++++++++
A fort/blas/.gitignore | 3 +++
A fort/blas/README | 1 +
A fort/blas/blas-3.12.0.tgz | 0
A fort/sgemv/Makefile | 20 ++++++++++++++++++++
A fort/sgemv/sgemv.f | 24 ++++++++++++++++++++++++
A main.c | 110 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
10 files changed, 252 insertions(+), 0 deletions(-)

diff --git a/.gitignore b/.gitignore @@ -0,0 +1,3 @@ +*.o +c_* +fort_* diff --git a/Makefile b/Makefile @@ -0,0 +1,35 @@ +CC = gcc +CFLAGS = -g -Wall -Wextra -Wpedantic +LIBS = -lpapi -lm + +.PHONY: clean + +all: main.c + # fortran + $(CC) -O2 $(CFLAGS) -DKERNEL=sgemv_ -o fort_O0 $< $(LIBS) fort/sgemv/sgemv_O0.o + $(CC) -O2 $(CFLAGS) -DKERNEL=sgemv_ -o fort_O1 $< $(LIBS) fort/sgemv/sgemv_O1.o + $(CC) -O2 $(CFLAGS) -DKERNEL=sgemv_ -o fort_O2 $< $(LIBS) fort/sgemv/sgemv_O2.o + $(CC) -O2 $(CFLAGS) -DKERNEL=sgemv_ -o fort_O3 $< $(LIBS) fort/sgemv/sgemv_O3.o + + # c + $(CC) -O2 $(CFLAGS) -DKERNEL=sgemv_noalias -o c_noalias_O0 $< $(LIBS) c/sgemv/sgemv_O0.o + $(CC) -O2 $(CFLAGS) -DKERNEL=sgemv_alias -o c_alias_O0 $< $(LIBS) c/sgemv/sgemv_O0.o + $(CC) -O2 $(CFLAGS) -DKERNEL=sgemv_temporary -o c_temporary_O0 $< $(LIBS) c/sgemv/sgemv_O0.o + + $(CC) -O2 $(CFLAGS) -DKERNEL=sgemv_noalias -o c_noalias_O1 $< $(LIBS) c/sgemv/sgemv_O1.o + $(CC) -O2 $(CFLAGS) -DKERNEL=sgemv_alias -o c_alias_O1 $< $(LIBS) c/sgemv/sgemv_O1.o + $(CC) -O2 $(CFLAGS) -DKERNEL=sgemv_temporary -o c_temporary_O1 $< $(LIBS) c/sgemv/sgemv_O1.o + + $(CC) -O2 $(CFLAGS) -DKERNEL=sgemv_noalias -o c_noalias_O2 $< $(LIBS) c/sgemv/sgemv_O2.o + $(CC) -O2 $(CFLAGS) -DKERNEL=sgemv_alias -o c_alias_O2 $< $(LIBS) c/sgemv/sgemv_O2.o + $(CC) -O2 $(CFLAGS) -DKERNEL=sgemv_temporary -o c_temporary_O2 $< $(LIBS) c/sgemv/sgemv_O2.o + + $(CC) -O2 $(CFLAGS) -DKERNEL=sgemv_noalias -o c_noalias_O3 $< $(LIBS) c/sgemv/sgemv_O3.o + $(CC) -O2 $(CFLAGS) -DKERNEL=sgemv_alias -o c_alias_O3 $< $(LIBS) c/sgemv/sgemv_O3.o + $(CC) -O2 $(CFLAGS) -DKERNEL=sgemv_temporary -o c_temporary_O3 $< $(LIBS) c/sgemv/sgemv_O3.o + +main: main.c + $(CC) -O2 $(CFLAGS) -DKERNEL=$(kernel) -o $@ $< $(LIBS) $(kernel_lib) + +clean: + rm -f main fort_* c_* diff --git a/c/sgemv/Makefile b/c/sgemv/Makefile @@ -0,0 +1,20 @@ +CC = gcc +CFLAGS = -g -c +.PHONY: clean + +all: sgemv_O0.o sgemv_O1.o sgemv_O2.o sgemv_O3.o + +sgemv_O0.o: sgemv.c + $(CC) -O0 $(CFLAGS) -o $@ 
$< + +sgemv_O1.o : sgemv.c + $(CC) -O1 $(CFLAGS) -o $@ $< + +sgemv_O2.o : sgemv.c + $(CC) -O2 $(CFLAGS) -o $@ $< + +sgemv_O3.o : sgemv.c + $(CC) -O3 $(CFLAGS) -o $@ $< + +clean: + rm -f sgemv_O* diff --git a/c/sgemv/sgemv.c b/c/sgemv/sgemv.c @@ -0,0 +1,36 @@ +#include <stdlib.h> + +void sgemv_noalias(const float *A, + const float *x, + float *__restrict__ y, + const int *n) +{ + const int N = *n; + for (int j = 0; j < N; ++j) { + for (int i = 0; i < N; ++i) { + y[j] += A[j * N + i] * x[i]; + } + } +} + +void sgemv_alias(const float *A, const float *x, float *y, const int *n) +{ + const int N = *n; + for (int j = 0; j < N; ++j) { + for (int i = 0; i < N; ++i) { + y[j] += A[j * N + i] * x[i]; + } + } +} + +void sgemv_temporary(const float *A, const float *x, float *y, const int *n) +{ + const int N = *n; + for (int j = 0; j < N; ++j) { + float temp = 0.0f; + for (int i = 0; i < N; ++i) { + temp += A[j * N + i] * x[i]; + } + y[j] += temp; + } +} diff --git a/fort/blas/.gitignore b/fort/blas/.gitignore @@ -0,0 +1,3 @@ +* +!README +!blas-3.12.0.tgz diff --git a/fort/blas/README b/fort/blas/README @@ -0,0 +1 @@ +https://netlib.org/blas/#_reference_blas_version_3_12_0 diff --git a/fort/blas/blas-3.12.0.tgz b/fort/blas/blas-3.12.0.tgz Binary files differ. diff --git a/fort/sgemv/Makefile b/fort/sgemv/Makefile @@ -0,0 +1,20 @@ +FC = gfortran +FFLAGS = -cpp -g -c -DTRANSPOSE +.PHONY: clean + +all: sgemv_O0.o sgemv_O1.o sgemv_O2.o sgemv_O3.o + +sgemv_O0.o: sgemv.f + $(FC) -O0 $(FFLAGS) -o $@ $< + +sgemv_O1.o : sgemv.f + $(FC) -O1 $(FFLAGS) -o $@ $< + +sgemv_O2.o : sgemv.f + $(FC) -O2 $(FFLAGS) -o $@ $< + +sgemv_O3.o : sgemv.f + $(FC) -O3 $(FFLAGS) -o $@ $< + +clean: + rm -f sgemv_O* diff --git a/fort/sgemv/sgemv.f b/fort/sgemv/sgemv.f @@ -0,0 +1,24 @@ + SUBROUTINE SGEMV(A,X,Y,N) + INTEGER(4) N,I,J + REAL(4) A(N,*),X(*),Y(*) + REAL(4) TEMP +#ifdef TRANSPOSE +! Form y := A^T*x + y. 
+ DO 100 J = 1,N + TEMP = 0.0 + DO 90 I = 1,N + TEMP = TEMP + A(I,J)*X(I) + 90 CONTINUE + Y(J) = Y(J) + TEMP + 100 CONTINUE +#else +! Form y := A*x + y. + DO 60 J = 1,N + TEMP = X(J) + DO 50 I = 1,N + Y(I) = Y(I) + TEMP*A(I,J) + 50 CONTINUE + 60 CONTINUE +#endif + RETURN + END diff --git a/main.c b/main.c @@ -0,0 +1,110 @@ +#include <stdio.h> +#include <stdlib.h> + +#include "papi.h" + +// Model name: Intel(R) Core(TM) i7-8700K CPU @ 3.70GHz +// L1d cache: 32K +// L1i cache: 32K +// L2 cache: 256K +// L3 cache: 12288K +#define L1_SIZE_KB 32 +#define L2_SIZE_KB 256 +#define L3_SIZE_KB 12288 + +#ifndef KERNEL +#define KERNEL sgemv_alias +#endif /* KERNEL */ + +extern void KERNEL(const float *A, const float *x, float *y, const int *n); + +int main(int argc, char *argv[]) +{ + int n = 10000; + if (argc > 1) { + n = atoi(argv[1]); + } + + float *A = (float *)malloc(n * n * sizeof(float)); + float *x = (float *)malloc(n * sizeof(float)); + float *y = (float *)malloc(n * sizeof(float)); + + for (int i = 0; i < n * n; i++) { + A[i] = 0.1f; + } + for (int i = 0; i < n; i++) { + x[i] = 1.0f; + y[i] = 0.001f; + } + + // Initialize PAPI + int event_set = PAPI_NULL; + int events[4] = {PAPI_TOT_CYC, PAPI_TOT_INS, PAPI_LST_INS, PAPI_L1_DCM}; + long long int counters[4]; + PAPI_library_init(PAPI_VER_CURRENT); + PAPI_create_eventset(&event_set); + PAPI_add_events(event_set, events, 4); + + // warm up + KERNEL(A, x, y, &n); + + // start PAPI measurement + PAPI_start(event_set); + + // assuming no overhead to call this timer (will pollute PAPI_TOT_CYC and + // PAPI_TOT_INS slightly, neglected here) + const long long int t0 = PAPI_get_real_nsec(); + + // run code to be measured + KERNEL(A, x, y, &n); + + // assuming no overhead to call this timer (will pollute PAPI_TOT_CYC and + // PAPI_TOT_INS slightly, neglected here) + const long long int t1 = PAPI_get_real_nsec(); + + // stop PAPI and get counter values + PAPI_stop(event_set, counters); + + // clang-format off + const long long 
total_cycles = counters[0]; // cpu cycles + const long long total_instructions = counters[1]; // any instruction + const long long total_load_stores = counters[2]; // load/store instructions + const long long total_l1d_misses = counters[3]; // L1d misses + // clang-format on + + const long long flops = 2 * n * n + n; + const long long mem_ops = 2 * n * n + 2 * n; + const double twall = ((double)t1 - t0) * 1.0e-9; // seconds + const double IPC = (double)total_instructions / total_cycles; + const double OI = (double)flops / (total_load_stores * sizeof(float)); + const double OI_theory = (double)flops / (mem_ops * sizeof(float)); + const double float_perf = flops / twall * 1.0e-9; // GFlop/s + double sum = 0.0; + for (long long int i = 0; i < n; i++) { + sum += y[i]; + } + + free(A); + free(x); + free(y); + + // clang-format off + printf("Result: %.1f\n", sum); + printf("Total cycles: %lld\n", total_cycles); + printf("Total instructions: %lld\n", total_instructions); + printf("Instructions per cycle (IPC): %.2f\n", IPC); + printf("L1 cache size: %d KB\n", L1_SIZE_KB); + printf("L2 cache size: %d KB\n", L2_SIZE_KB); + printf("L3 cache size: %d KB\n", L3_SIZE_KB); + printf("Total problem size: %ld KB\n", + (n * n + 2 * n) * sizeof(float) / 1024); + printf("Total L1 data misses: %lld\n", total_l1d_misses); + printf("Total load/store: %lld (expected: %lld)\n", + total_load_stores, mem_ops); + printf("Operational intensity: %e (expected: %e)\n", OI, OI_theory); + printf("Performance [GFlop/s]: %e\n", float_perf); + printf("Wall-time [micro-seconds]: %e\n", twall * 1.0e6); + // clang-format on + + return 0; +}