o
    }oi                     @   sP   d dl Z d dlZd dlm  mZ d dlmZmZ e j	j
e dddd ZdS )    N)HAVE_LINEAR_LOSS_CEfused_linear_cross_entropyzLinear loss CE is not installed)reasonc                  C   s  t j s
td t d} d}d}d}d}t j}t j||||| d}t j|||| d}t jd|||f| d	}t j	  t j
jd|d
# |d|}	t |	| }
|d}tj|
|dd}W d   n1 skw   Y  t j }t j  ddl}|  t j	  t j
jd|d
 t|||}W d   n1 sw   Y  t j }td td|d dd td|d dd td|| d dd | }| }t j||dddsJ d|  d|  ||k sJ ddS )z
    Tests fused_linear_cross_entropy against PyTorch's cross_entropy implementation, fused_linear_cross_entropy should:
        * has close output with PyTorch's cross_entropy
        * uses less memory than PyTorch's cross_entropy
    zThis test requires a GPUcuda   i   i   i  )dtypedevicer   )r   )device_typer   mean)	reductionNz
Memory usage comparison:zPyTorch implementation: i   z.2fz MBzFused implementation: zMemory savings: g{Gz?)rtolatolzLoss mismatch: PyTorch=z, Fused=zGFused implementation should use less memory than PyTorch implementation)torchr   is_availablepytestskipr   bfloat16randnrandintreset_peak_memory_statsampautocastreshapematmultFcross_entropymax_memory_allocatedempty_cachegccollectr   printfloatallcloseitem)r   
batch_size
seq_length
hidden_dim
vocab_sizer   hidden_statesweighttargetshidden_states_reshapedlogitstargets_reshapedpytorch_losspytorch_memoryr    
fused_lossfused_memory r4   W/home/ubuntu/.local/lib/python3.10/site-packages/tests/automodel/loss/test_linear_ce.pytest_fused_cross_entropy   sN   








r6   )r   r   torch.nn.functionalnn
functionalr   nemo.automodel.loss.linear_cer   r   markskipifr6   r4   r4   r4   r5   <module>   s   