/*
 * c3.c - Dot product micro-benchmark using BLAS cblas_sdot (C3)
 * Usage: ./c3 <N> <repetitions>
 * Compiles: gcc -O3 -Wall -o c3 c3.c -lcblas
 *
 * Uses optimized BLAS library (cblas_sdot) instead of hand-written loop.
 * On HPC: use #include <mkl_cblas.h> and link with MKL.
 * Here: using system CBLAS (same API, OpenBLAS/reference BLAS backend).
 */
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <cblas.h>

/* BLAS-based dot product: single call to optimized library */
float bdp(long N, float *pA, float *pB) {
    float R = cblas_sdot(N, pA, 1, pB, 1);
    return R;
}

int main(int argc, char *argv[]) {
    if (argc != 3) {
        fprintf(stderr, "Usage: %s <N> <repetitions>\n", argv[0]);
        return 1;
    }

    long N = atol(argv[1]);
    int reps = atoi(argv[2]);

    float *pA = (float *)malloc(N * sizeof(float));
    float *pB = (float *)malloc(N * sizeof(float));
    if (!pA || !pB) {
        fprintf(stderr, "Memory allocation failed for N=%ld\n", N);
        return 1;
    }
    for (long i = 0; i < N; i++) {
        pA[i] = 1.0f;
        pB[i] = 1.0f;
    }

    double *times = (double *)malloc(reps * sizeof(double));
    struct timespec start, end;
    float result = 0.0f;

    for (int i = 0; i < reps; i++) {
        clock_gettime(CLOCK_MONOTONIC, &start);
        result = bdp(N, pA, pB);
        clock_gettime(CLOCK_MONOTONIC, &end);
        times[i] = (end.tv_sec - start.tv_sec) +
                   (end.tv_nsec - start.tv_nsec) / 1e9;
    }

    /* Mean of second half only (warmup excluded) */
    int half = reps / 2;
    double mean_time = 0.0;
    for (int i = half; i < reps; i++) {
        mean_time += times[i];
    }
    mean_time /= (reps - half);

    double bandwidth = (2.0 * N * sizeof(float)) / mean_time / 1e9;
    double throughput = (2.0 * N) / mean_time;

    printf("N: %ld  <T>: %f sec  B: %.3f GB/sec  F: %.3f FLOP/sec\n",
           N, mean_time, bandwidth, throughput);
    printf("Result: %f (expected: %ld)\n", result, N);

    free(pA);
    free(pB);
    free(times);
    return 0;
}
