"""
c4.py - Dot product micro-benchmark using a simple Python loop (C4)
Usage: python3 c4.py <N> <repetitions>

Uses numpy arrays as input but computes dot product via Python for-loop.
Demonstrates Python loop overhead vs. C and vectorized implementations.
"""
import sys
import time
import numpy as np


def dp(N, A, B):
    """Return the dot product of the first N elements of A and B.

    The explicit element-by-element indexed loop is deliberate: this
    benchmark exists to expose per-element interpreter overhead, so the
    loop must not be replaced by a vectorized call.
    """
    total = 0.0
    for idx in range(N):
        total += A[idx] * B[idx]
    return total


def main():
    """Run the dot-product benchmark from the command line and print results.

    Reads ``<N>`` (vector length) and ``<repetitions>`` from ``sys.argv``,
    times ``dp`` on two float32 vectors of ones once per repetition, and
    prints the mean time of the second half of the repetitions together
    with the bandwidth and FLOP rate derived from it, followed by the last
    computed result.

    Exits with a non-zero status when the argument count is wrong, when
    either argument is not an integer, when N is negative, or when
    repetitions is less than 1.
    """
    if len(sys.argv) != 3:
        print(f"Usage: {sys.argv[0]} <N> <repetitions>")
        sys.exit(1)

    try:
        N = int(sys.argv[1])
        reps = int(sys.argv[2])
    except ValueError:
        sys.exit("Error: <N> and <repetitions> must be integers")

    # Validate before allocating: a negative N would fail inside np.ones,
    # and reps < 1 would leave no samples to average (division by zero).
    if N < 0:
        sys.exit("Error: <N> must be non-negative")
    if reps < 1:
        sys.exit("Error: <repetitions> must be at least 1")

    # Initialize vectors to 1.0 (float32 as specified)
    A = np.ones(N, dtype=np.float32)
    B = np.ones(N, dtype=np.float32)

    times = []
    result = 0.0

    for _ in range(reps):
        # perf_counter is a monotonic, high-resolution clock available on
        # every platform (time.clock_gettime exists only on Unix).
        start = time.perf_counter()
        result = dp(N, A, B)
        end = time.perf_counter()
        times.append(end - start)

    # Mean of second half of repetitions (warmup excluded)
    measured = times[reps // 2:]
    mean_time = sum(measured) / len(measured)

    if mean_time > 0:
        # Bandwidth: 2 arrays * N * 4 bytes = 8N bytes
        bandwidth = (2.0 * N * 4) / mean_time / 1e9
        # Throughput: N muls + N adds = 2N FLOPs
        throughput = (2.0 * N) / mean_time
    else:
        # Elapsed time was below the timer resolution; the rates are
        # undefined rather than a division-by-zero crash.
        bandwidth = throughput = float("nan")

    print(f"N: {N}  <T>: {mean_time:.6f} sec  B: {bandwidth:.3f} GB/sec  F: {throughput:.3f} FLOP/sec")
    print(f"Result: {result} (expected: {N})")


# Run the benchmark only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
