o
    QiA                     @   s   d Z ddlZddlmZ ddlm  mZ ddlZddlm	Z	 ddl
Z
ddlZG dd dejZG dd dejZd'dd	Zd
ZdZd(ddZd)ddZdd Zdd Zdd Zdd Zdd Zd d! Zd"d# Zd$d% Zed&krue  dS dS )*ug   
Lab 2 — HPML Spring 2026
ResNet-18 on CIFAR10: Training & Profiling (Part A)
Exercises: C1–C6, Q3
    Nc                       s*   e Zd ZdZd fdd	Zdd Z  ZS )
BasicBlock   Tc                    s   t    tj||d|ddd| _|rt|nt | _tj||ddddd| _|r0t|nt | _	t
 | _|dksE||| j krhtj||| j d|ddg}|r`|t|| j  tj
| | _d S d S )N   r   Fstridepaddingbias)r   r   )super__init__nnConv2dconv1BatchNorm2dIdentitybn1conv2bn2
Sequentialshortcut	expansionappend)selfin_chout_chr   use_bnsc	__class__ /home/ubuntu/hpml_nyu/lab2.pyr
      s   

zBasicBlock.__init__c                 C   s>   t | | |}| | |}|| |7 }t |S N)Frelur   r   r   r   r   )r   xoutr   r   r   forward&   s   
zBasicBlock.forward)r   T)__name__
__module____qualname__r   r
   r%   __classcell__r   r   r   r   r      s    r   c                       s.   e Zd Zd	 fdd	Zdd Zdd Z  ZS )
ResNet
   Tc                    s   t    d| _|| _tjddddddd| _|rtdnt | _	| j
|d|d dd| _| j
|d|d d	d| _| j
|d
|d	 d	d| _| j
|d|d d	d| _td| _td|j || _d S )N@   r   r   Fr   r   )r            i   )r   r   )r	   r
   	in_planesr   r   r   r   r   r   r   _build_grouplayer1layer2layer3layer4AdaptiveAvgPool2davgpoolLinearr   fc)r   blocklayersnum_classesr   r   r   r   r
   .   s   
zResNet.__init__c                 C   sP   |gdg|d   }g }|D ]}| || j||| j ||j | _qtj| S Nr   )r   r0   r   r   r   r   )r   r:   planesn_blocksr   stridesr;   sr   r   r   r1   @   s   
zResNet._build_groupc                 C   s^   t | | |}| |}| |}| |}| |}| |}t	
|d}| |S r=   )r!   r"   r   r   r2   r3   r4   r5   r7   torchflattenr9   )r   r#   r   r   r   r%   I   s   





zResNet.forward)r+   T)r&   r'   r(   r
   r1   r%   r)   r   r   r   r   r*   -   s    	r*   Tc                 C   s   t tg d| dS )N)r.   r.   r.   r.   r   )r*   r   rD   r   r   r   resnet18T   s   rE   )gHPs?gec]?g~jt?)gۊe?ggDio?g|?5^?./datar-   r.   c              	   C   sx   |rt t jdddt dt  t ttg}nt t  t ttg}tj	j
| |d|d}tjjj||||dS )N       )r   g      ?T)roottraindownload	transform)
batch_sizeshufflenum_workers)
transformsCompose
RandomCropRandomHorizontalFlipToTensor	Normalize
CIFAR_MEAN	CIFAR_STDtorchvisiondatasetsCIFAR10rB   utilsdata
DataLoader)	data_pathrM   rO   rJ   rL   datasetr   r   r   get_cifar10_loader^   s"   

r`   皙??Mb@?c                 C   sj   |   } | dkrtjj||||dS | dkr!tjj||||ddS | dkr.tjj|||dS td|  )	Nsgd)lrmomentumweight_decaysgd_nesterovT)re   rf   rg   nesterovadam)re   rg   zUnknown optimizer: )lowerrB   optimSGDAdam
ValueError)namemodel_paramsre   rf   rg   r   r   r   get_optimizerv   s   rr   c                 C   s(  |    d}d}d}d}d}	t }
|D ]o\}}|t |
 7 }||||}}|jdkr5tj  t }|  | |}|||}|	  |
  |jdkrXtj  |	t | 7 }	|| 7 }|d\}}||d7 }|||  7 }t }
qt|}|| d| | ||	fS )zQSingle epoch: fwd+bwd on all batches, returns (loss, acc, data_time, train_time).        r   cudar   g      Y@)rJ   timeperf_countertotyperB   rt   synchronize	zero_gradbackwardstepitemmaxsizeeqsumlen)modelloader	criterion	optimizerdevicerunning_losscorrecttotal	data_time
train_time	batch_endinputstargetst0outputsloss_	predicted	n_batchesr   r   r   train_one_epoch   s8   





r   c                 C   sz  t | j d|}t| j| j| j}t }t	| j
| }tdd  td| j d td| j
 d| j d| js>d	nd
 d|  td  tddddddddddddddddd td| jd D ]G}|jdkrztj  t }t|||||\}}	}
}|jdkrtj  t | }t|dd|dd|	dd|
dd|dd|d qnt  ||fS )z4C1: full train loop, C2: timing breakdown per epoch.rD   
<============================================================u    C1/C2: Training — z epochsz optimizer=z
  workers=z  bn=onoffz	  device=Ep>3 Loss>8Acc%>7zData(s)Train(s)>9Total(s)r   rt   >8.4f>6.2f% z>8.2f>9.2f)rE   no_batchnormrw   r`   r^   rM   rO   r   CrossEntropyLossrr   r   
parametersprintepochsrangerx   rB   rt   ry   ru   rv   r   )argsr   r   r   r   r   epocht_startr   accdtttt_totalr   r   r   run_training   s0   
6



8r   c              
   C   sx  ddl }|d ddlm} ttddd}g }g }tdd  td| j d	 td  td
dddddddddd |D ]}t 	|}t
| j| j|}	t }
td| }d\}}}td| jd D ]6}|jdkrytj  t }t||	|
||\}}}}|jdkrtj  ||7 }||7 }|t | 7 }qm|| j }|| j }|| j }|| || t|dd|dd|dd|d qE|t|}|| }td| d|| dd|| dd |jdd\}}|j||dd d!d" |j||d#d d$d" |d% |d& |d' | | |!  |j"d(d)d* |#  |j$d+d,d- td. |S )/zLC3: vary num_workers in steps of 4, measure data-loading time, plot results.r   NAgg   rH   r   r   u(    C3: I/O Optimization — Worker Sweep (z epochs each)Workersr   r   z
AvgData(s)z>11AvgTrain(s)>12zAvgTotal(s)rd   )rs   rs   rs   r   rt   z>11.3fz>12.3fz
C3.2 => Best num_workers = z (avg total: z.3fzs, avg data: zs))      )figsizezbo-r.   zData Loading)	linewidthlabelzrs--zTotal EpochrO   zAvg Time per Epoch (s)z'C3: DataLoader I/O Time vs. num_workersTg333333?)alphazc3_workers.png   )dpizPlot saved -> c3_workers.png)%
matplotlibusematplotlib.pyplotpyplotlistr   r   r   rE   rw   r`   r^   rM   r   r   rr   r   rx   rB   rt   ry   ru   rv   r   r   indexminsubplotsplot
set_xlabel
set_ylabel	set_title
set_xtickslegendgridtight_layoutsavefig)r   r   r   pltworker_listavg_data_timesavg_total_timesnwr   r   r   r   sum_dtsum_tt	sum_totalepr   r   r   r   avg_dtavg_ttavg_totbest_idxbest_nwfigax1r   r   r   run_worker_sweep   sh   

&









(





r   c                 C   s  i }dD ]}t |}t |}t| j| j| j}t	 }t
d| }td|  d| j d g }td| jd D ]E}	|jdkrIt j  t }
t|||||\}}}}|jdkrct j  t |
 }|| td|	 d	|d
d|dd|dd	 q=t|t| }|||< td|  d|dd qtdd  td| j d td  td|d dd td|d dd |d |d  }td|dd |S )z9C4: train 5 epochs on each device, report avg epoch time.)cpurt   rd   
  [z] Training z
 epochs...r   rt   z    Ep z: loss=.4fz  acc=.2fz%  time=rA   z  [z] Avg epoch time: r   r   z! C4 Summary: GPU vs CPU (workers=)z  CPU avg: r   zs/epochz  GPU avg: z  GPU speedup: z.1fr#   )rB   r   rE   rw   r`   r^   rM   rO   r   r   rr   r   r   upperr   r   rx   rt   ry   ru   rv   r   r   r   r   )r   resultsdev_namer   r   r   r   r   epoch_timesr   r   r   r   r   r   elapsedavgspeedupr   r   r   run_cpu_vs_gpu  s<   





*
r   c                 C   s*  g d}i }|D ]}t  |}t| j| j| j}t }t||	 }t
d|  d t
dddddd	dd
ddddddd
 g }	td| jd D ]M}
|jdkr\tj  t }t|||||\}}}}|jdkrvtj  t | }|	||||f t
d|
dd|dd|dd|dd|d
 qPtdd |	D | j }tdd |	D | j }tdd |	D | j }|||f||< qt
dd  t
d| j d t
d  t
dddddd	dd d	dd!d" |D ]}|| \}}}t
d|dd|dd|d#d|d$ q|S )%uB   C5: same setup, three optimizers — per-epoch stats side by side.rd   rh   rj   r   ]z  r   r   r   r   r   r   r   r   r   r   r   rt   r   r   r   r   c                 s       | ]}|d  V  qdS r   Nr   .0rr   r   r   	<genexpr>]      z+run_optimizer_comparison.<locals>.<genexpr>c                 s   r   r   Nr   r   r   r   r   r   ^  r   c                 s   r   )r.   Nr   r   r   r   r   r   _  r   r   r   z+ C5 Summary: Optimizer Comparison (workers=r   	Optimizerz<15AvgLosszAvgAcc%r   r   z>7.2fz>12.2f)rE   rw   r`   r^   rM   rO   r   r   rr   r   r   r   r   r   rx   rB   rt   ry   ru   rv   r   r   r   )r   r   	opt_namesall_resultsoptr   r   r   r   
epoch_datar   r   r   r   r   r   r   avg_lossavg_accr   latr   r   r   run_optimizer_comparison@  s@   0



2
(*r  c                 C   s  t dd|}t| j| j| j}t }td|	 }t
dd  t
d| j d| j d t
d  t
d	d
dddddddddddd	 g }td| jd D ]L}|jdkrbtj  t }t|||||\}	}
}}|jdkr|tj  t | }||	|
||f t
|d
d|	dd|
dd|dd|d	 qVtdd |D | j }tdd |D | j }t
d|dd|dd  ||fS )!zAC6: train with SGD but all batchnorm layers replaced by Identity.FrD   rd   r   r   u    C6: Without Batch Norm — z epochs (SGD, workers=r   r   r   r   r   r   r   r   r   r   r   r   rt   r   r   r   r   c                 s   r   r   r   r   r   r   r   r     r   z#run_no_batchnorm.<locals>.<genexpr>c                 s   r   r   r   r   r   r   r   r     r   z
C6 Summary => avg loss: r   z, avg acc: r   %)rE   rw   r`   r^   rM   rO   r   r   rr   r   r   r   r   rx   rB   rt   ry   ru   rv   r   r   r   )r   r   r   r   r   r   r  r   r   r   r   r   r   r   r  r  r   r   r   run_no_batchnormn  s.   
.



0r
  c                 C   sz   t dd |  D }t dd |  D }td|d td|d t dd |j D }td|  ||fS )	Nc                 s   s    | ]
}|j r| V  qd S r    )requires_gradnumelr   pr   r   r   r     s    z#count_parameters.<locals>.<genexpr>c                 s   s"    | ]}|j d ur| V  qd S r    )gradr  r  r   r   r   r     s     z[Q3] Trainable params : ,z[Q3] Params w/ grads  : c                 s   s    | ]}t |V  qd S r    )r   )r   vr   r   r   r     r   z[Q3] Optimizer states : )r   r   r   statevalues)r   r   	trainable
grad_countn_statesr   r   r   count_parameters  s   r  c                  C   s  t jdd} | jddd | jdtdd | jd	td
dd | jdtdd | jdtdd | jdtdg dd | jdddd | jdtdg ddd |  }t|jr]tj	 r]dnd}t
d|  t
d td!d" t  D d# |jd$v rt||\}}t|| |jd%v rt||}t
d&| d' |jd(v rt
d)d*  t
d+ t
d*  t| |jd,v rt
d)d*  t
d- t
d*  t|| |jd.v rt|| d S d S )/Nu   Lab2 — ResNet-18 CIFAR10)descriptionz--cuda
store_true)actionz--data_pathrF   )rx   defaultz--num_workersrH   zdefault=4 (optimal from C3))rx   r  helpz--batch_sizer-   z--epochsr   z--optimizerrd   r   )rx   r  choicesz--no_batchnormzC6: disable batch norm layers)r  r  z--taskrJ   )rJ   c3c4c5c6allzwhich experiment to run)rx   r  r  r  rt   r   zDevice: zResNet-18 params: c                 s   s    | ]}|  V  qd S r    )r  r  r   r   r   r     r   zmain.<locals>.<genexpr>r  )rJ   r"  )r  r"  z
=> Use --num_workers z for subsequent experiments)r  r"  r   r   z C4: GPU vs CPU)r   r"  z C5: Optimizer Comparison)r!  r"  )argparseArgumentParseradd_argumentstrint
parse_argsrB   r   rt   is_availabler   r   rE   r   taskr   r  r   r   r  r
  )parserr   r   r   r   r   r   r   r   main  sR   


$









r,  __main__)T)rF   r-   r.   T)ra   rb   rc   )__doc__rB   torch.nnr   torch.nn.functional
functionalr!   rX   torchvision.transformsrP   r#  ru   Moduler   r*   rE   rV   rW   r`   rr   r   r   r   r   r  r
  r  r,  r&   r   r   r   r   <module>   s4    
'

-!F).#-
