"""
c5.py - Dot product micro-benchmark using numpy.dot (C5)
Usage: python3 c5.py <N> <repetitions>

Uses numpy.dot, which typically dispatches to an optimized, SIMD-vectorized BLAS routine.
Should be orders of magnitude faster than the Python loop in c4.py.
"""
import sys
import time
import numpy as np


def dp_numpy(N, A, B):
    """Return the dot product of A and B as computed by ``np.dot``.

    N is part of the signature but is not used by this implementation;
    the arrays themselves are handed straight to ``np.dot``.
    """
    return np.dot(A, B)


def main():
    if len(sys.argv) != 3:
        print(f"Usage: {sys.argv[0]} <N> <repetitions>")
        sys.exit(1)

    N = int(sys.argv[1])
    reps = int(sys.argv[2])

    # Initialize vectors to 1.0 (float32 as specified)
    A = np.ones(N, dtype=np.float32)
    B = np.ones(N, dtype=np.float32)

    times = []
    result = 0.0

    for i in range(reps):
        start = time.clock_gettime(time.CLOCK_MONOTONIC)
        result = dp_numpy(N, A, B)
        end = time.clock_gettime(time.CLOCK_MONOTONIC)
        times.append(end - start)

    # Mean of second half of repetitions (warmup excluded)
    half = reps // 2
    mean_time = sum(times[half:]) / (reps - half)

    # Bandwidth: 2 arrays * N * 4 bytes = 8N bytes
    bandwidth = (2.0 * N * 4) / mean_time / 1e9

    # Throughput: N muls + N adds = 2N FLOPs
    throughput = (2.0 * N) / mean_time

    print(f"N: {N}  <T>: {mean_time:.6f} sec  B: {bandwidth:.3f} GB/sec  F: {throughput:.3f} FLOP/sec")
    print(f"Result: {result} (expected: {N})")


# Run the benchmark only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
