/*
 * c2.c - Dot product micro-benchmark with 4x loop unrolling (C2)
 * Usage: ./c2 <N> <repetitions>
 * Compiles: gcc -O3 -Wall -o c2 c2.c
 *
 * Same benchmark as dp1.c but with manually unrolled loop (stride 4)
 * Reduces loop overhead, enables better instruction pipelining
 */
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/* 4x unrolled dot product: reduces branch overhead, better ILP */
float dpunroll(long N, float *pA, float *pB) {
    float R = 0.0;
    int j;
    for (j = 0; j < N; j += 4)
        R += pA[j] * pB[j] + pA[j+1] * pB[j+1] +
             pA[j+2] * pB[j+2] + pA[j+3] * pB[j+3];
    return R;
}

int main(int argc, char *argv[]) {
    if (argc != 3) {
        fprintf(stderr, "Usage: %s <N> <repetitions>\n", argv[0]);
        return 1;
    }

    long N = atol(argv[1]);
    int reps = atoi(argv[2]);

    float *pA = (float *)malloc(N * sizeof(float));
    float *pB = (float *)malloc(N * sizeof(float));
    if (!pA || !pB) {
        fprintf(stderr, "Memory allocation failed for N=%ld\n", N);
        return 1;
    }
    for (long i = 0; i < N; i++) {
        pA[i] = 1.0f;
        pB[i] = 1.0f;
    }

    double *times = (double *)malloc(reps * sizeof(double));
    struct timespec start, end;
    float result = 0.0f;

    for (int i = 0; i < reps; i++) {
        clock_gettime(CLOCK_MONOTONIC, &start);
        result = dpunroll(N, pA, pB);
        clock_gettime(CLOCK_MONOTONIC, &end);
        times[i] = (end.tv_sec - start.tv_sec) +
                   (end.tv_nsec - start.tv_nsec) / 1e9;
    }

    /* Mean of second half only (warmup excluded) */
    int half = reps / 2;
    double mean_time = 0.0;
    for (int i = half; i < reps; i++) {
        mean_time += times[i];
    }
    mean_time /= (reps - half);

    double bandwidth = (2.0 * N * sizeof(float)) / mean_time / 1e9;
    double throughput = (2.0 * N) / mean_time;

    printf("N: %ld  <T>: %f sec  B: %.3f GB/sec  F: %.3f FLOP/sec\n",
           N, mean_time, bandwidth, throughput);
    printf("Result: %f (expected: %ld)\n", result, N);

    free(pA);
    free(pB);
    free(times);
    return 0;
}
