o
    ίi]                  &   @   s  d dl Z d dlZd dlZd dlZd dlZd dlZd dlmZ d dlm	Z	 d dl
mZ d dlmZmZmZmZmZ d dlZd dlmZ d dlm  mZ d dlmZmZ d dlmZ d dlmZmZ d d	l m!Z! d d
l"m#Z# d dl$m%Z% d dl&m'Z' ej(j)dddddej*de+de+ddfddZ,e,j-dej*de+de+ddfddZ.G dd dej/j0Z1da2dej*de+dej*fddZ3g dZ4ej5dej6dej*fdd Z7dcd!ej*dee+ef fd"d#Z8d$ej*ddfd%d&Z9ddd'd$ej*d(ee: d)e;d*eej* d+eej* dej*fd,d-Z<d.e:dee: fd/d0Z=e> 			1			2	ddd3e+d4ej*d5ej*d6ej*fd7d8Z?e> 	dcd3e+d4ej*d5ej*d6ej*d*eej* d+eej* d9eej* d:e:d;e:d<e@d=e@d>e;d?e:d@e:dAe;dBeej* dCe;ddf$dDdEZAdFdG ZBG dHdI dIe	ZCG dJdK dKe!ZDdLeejE dMejEddfdNdOZFG dPdQ dQZGeDZHdRZIdSZJdTZKG dUdV dVejEZLG dWdX dXejEZMG dYdZ dZejEZNG d[d\ d\eNZOG d]d^ d^ejEZPded_d`ZQdedadbZRdS )f    N)defaultdict)Enum)Path)AnyDictListOptionalTuple)CheckpointImplcheckpoint_wrapper)normalize_function)
SDPBackendsdpa_kernel)TorchDispatchMode)tree_map)ModuleTracker)fmhaztorchprobe::log )mutates_argsdevice_typesxnameuidreturnc                 C      d S Nr   r   r   r   r   r   >/home/ubuntu/.local/lib/python3.10/site-packages/core/probe.py_log&      r   c                 C   r   r   r   r   r   r   r   	_log_fake+   r   r    c                   @   s8   e Zd ZedejdefddZedejfddZdS )		_LogStatsr   r   c                 C   s.   t t }tjj||| || _|| _|S r   )	struuiduuid4torchops
torchprobelogr   r   )ctxr   r   r   r   r   r   forward1   s
   z_LogStats.forwardgradc                 C   s$   t jj|| j d| j |d fS )Nz.g)r%   r&   r'   r(   r   r   )r)   r+   r   r   r   backward9   s   z_LogStats.backwardN)	__name__
__module____qualname__staticmethodr%   Tensorr"   r*   r,   r   r   r   r   r!   0   s
    r!   Fc                 C   s   t s| S t| |S r   )_PROBING_ENABLEDr!   apply)r   r   r   r   r   	log_statsB   s   r4   )gHz>gư>gh㈵>g-C6?gMbP?g{Gz?g?g?g333333?      ?gffffff?g?gffffff?gGz?g+?gH.?gwJ?g!?gP?devicec                 C   s   t jt| |dS )Nr6   dtype)r%   tensor	QUANTILESr7   r   r   r   _get_quantiles_   s   r;   x_c                 C   s   | j tjtjtjtjfvri S |  }|r|| tdk  }|j tjur)| }| }t|j	|j }|
 }| }t| j|||| | d  
 || | d  
 |
 | | t|d d |d	S )Ninf      i   )	shapemeanstdskewkurtosisabs.meanmaxmin	quantiles)r8   r%   floatdoublefloat16bfloat16flattenabsr;   r6   rA   rB   tupler@   rF   rG   quantile)r<   
remove_infr   xabsrH   rA   rB   r   r   r   
_get_statsd   s*   rS   logitsc                 C   sB   | j dksJ tj | d d d d d d || | d d f< d S )Nr?      )ndimmathr=   )rT   q_idxq_lenkv_lenr   r   r   _mask_attn_causal_inplace   s   4r[   )cu_seqlens_qcu_seqlens_krX   causalr\   r]   c                C   s  | j tju sJ |d ur|d usJ | jdksJ | j| }| }g }dg| jd  }d}	tt||dd  ||dd  D ]V\}
\}}}}t||D ]}|
||< qN|	t	|k r||	 |k r|
|
 |rt| d d d d |	|	d ||f ||	 | || ||  |	d7 }	|	t	|k r||	 |k saqAtj|| jdd d d d d f tj|| jdd d d d d f k}tj | || < |	t	|ksJ | S |rtt	|D ]!}	t| d d d d |	|	d d d f ||	 | jd | jd  q| S )	Nr?   r   rU   )r6      r>   )r8   r%   float32rV   r@   tolist	enumerateziprangelenappendr[   r9   r6   rW   r=   	expand_as)rT   rX   r^   r\   r]   qsks	q_batchid	k_batchidq_idx_ibidq0q1k0k1kmask_outr   r   r   _mask_attn_logits   sP   2

"
	"rv   num_queriesc                 C   s   t td| td| d S )Nr   rU      )listrf   rF   )rw   r   r   r   _attn_queries_subset   s   rz           Tpathquerykeyvaluec              
   K   s4  |	d u rd|j d d  }	|d us|d us|dks|rBt|j t|j t|j |d ur.|j nd |||	t| d| j| d< d S t|j d }|d d d d |f |dd |	 }t| ||d}| 	d}|
d|tj kt|}|| d }| | d	| | j| d
|dd d S )NrU   r`   r5   r{   )query.shape	key.shapevalue.shape	attn_mask	dropout_p	is_causalscale
unk_kwargs::attnr_   )r^   ::attn_entropy::attn_logitsTrQ   )r@   rO   ry   keysstorerz   	transposerv   rI   softmaxlog_softmaxwhererW   r=   r%   
zeros_likesum
log_tensor)prober|   r}   r~   r   r   	attn_biasr   r   r   compute_log_sumexpreturn_debug_maskkwargsquery_srT   pmasked_logsoftentropyr   r   r   _compute_attn_stats_sdpa   s.   

&
r   	seqused_kmax_seqlen_qmax_seqlen_kr   softmax_scaler   window_leftwindow_rightreturn_softmaxblock_tablesunpadded_lsec                 C   sb  |d us|
dks|dks|dks|d ur,t |jt |jt |jdd| j| d< d S |d urH|jdks:J |j|d  |d  |d  }}}|jdksRJ |jt|jd }|d d |f dd	|dd	d
d | }t| ||||d}| d
}
|	d

|tj kt|}|
| d
 }| | d| | j| d|dd d S )Nr{   r   flash)r   r   r   opr   r>   r?   rU   ra   r`   r_   )r\   r]   r^   r   r   Tr   )rO   r@   r   rV   rz   r   rv   rI   r   r   r   rW   r=   r%   r   r   r   )r   r|   r}   r~   r   r\   r]   r   r   r   r   r   r   r   r   r   r   r   r   rT   r   r   r   r   r   _compute_attn_stats_flash   sH   
r   c                 C   s   t | tjs| S |  S r   )
isinstancer%   r1   rc   )r   r   r   r   _tensors_to_python'  s   r   c                   @   s   e Zd ZdZdZdZdS )LinearBwTyperU   ra   r>   N)r-   r.   r/   DWDXUNKNOWNr   r   r   r   r   .  s    r   c                       s   e Zd Zddejdee ddfddZ fddZd fd	d
Z	dd Z
dddZdedejdeeef fddZdedejddfddZdddZ  ZS )
AutoProbeDNmodule
write_filer   c                 C   sr   |d urt |nd | _d | _t|| _t | _tt| _	i | _
i | _i | _d | _d| _tttjdd| _d S )NFPROBE_VERBOSE0)r   r   write_tensors_tmpdirTorchCompileDisablercompile_disablerr   mod_trackerr   intcount_per_pathr   linear_datauid_to_pathmetadataenabledboolosenvirongetverbose)selfr   r   r   r   r   __init__5  s   

zAutoProbeD.__init__c                    s:   | j rJ d| j  | j  t   d| _ da| S )NzEntered probe twiceT)r   r   	__enter__r   superr2   r   	__class__r   r   r   B  s   


zAutoProbeD.__enter__c                    sH   | j sJ dt j|  | jj|  | jj|  |   dad| _ d S )Nz!Exiting probe without entering itF)r   r   __exit__r   r   _flush_and_clearr2   )r   argsr   r   r   r   M  s   
zAutoProbeD.__exit__c                 C   s\   | j d ur,| j jjdd | j j| j j dtt d d   | _| jjdd d S d S )NTexist_okz-tmp-   )r   parentmkdirr   r"   r#   r$   r   r   r   r   r   _setup_tensors_loggingW  s   
 z!AutoProbeD._setup_tensors_loggingc                 C   sD  | j d ur2tt| j}| j d}t|| jdtd| |	d W d    n1 s-w   Y  | j
d ur| j d us>J | j
j| j j d }|jdd d}d	| jv rad	t| jd	 d
}|dksk||  r~tt|| d}| d| }|| }| rJ | j
| d | _
| j  | j  | j  d S )Nara   )datametaversionrH   
z-dumpTr    it010zv*v)r   r   r   r   openjsondumpr   r:   writer   r   r   r   r   existsrg   ry   globrenameclearr   r   )r   	dump_datafddump_dirdir_name	num_filesr   r   r   r   `  s<   
	



zAutoProbeD._flush_and_clearr|   outc                    s   dt dtf fdd}dt dtf fdd}|jv r1||r(|tjfS ||r1|tjfS jjD ]}|jvr=q5||rH|tjf  S ||rS|tjf  S q5|tjfS )z
        We are in the BW pass, and process a GEMM.
        Let's figure out:
        (1) The path for the FW pass (might differ in case of ModuleTracker bug)
        (2) The type of BW pass (eg `dw` or `dx`)
        r|   r   c                    sJ   j |  \}}}}}j|d |d fko$t| d d dd df S )NrU   r   r?   r   r@   r%   allcloser|   in_shapew_shape	out_shapeinput_sm	weight_smr   r   r   r   r   _is_path_correct_dw  s   z>AutoProbeD._find_bw_path_and_type.<locals>._is_path_correct_dwc                    s>   j |  \}}}}}j|kot| d d dd df S )NrU   r?   r   r   r   r   r   _is_path_correct_dx  s   *z>AutoProbeD._find_bw_path_and_type.<locals>._is_path_correct_dx)	r"   r   r   r   r   r   r   parentsr   )r   r|   r   r   r   r   candidate_pathr   r   r   _find_bw_path_and_type  s    





z!AutoProbeD._find_bw_path_and_typer   r   c                 K   sT   t |fi || j|< | jd ur(|dddd}t|| j| d  d S d S )N::__/r   z.pkl)rS   r   r   replacer%   save)r   r   r   r   	name_safer   r   r   r     s
   
zAutoProbeD.log_tensorr   c                 C   s  |r|ni }d }| j jD ]}|dkrq|d u st|t|kr!|}q|d u r(d}|dd}||i |}|jtjjjtjjj	fv r| j j
s|jtjjjkrX|d d \}}	}
n|jtjjj	ksbJ |d d \}	}
| | d|	 | | d|
 | | d| |	j|
j|j|	d d	d d	f  |
d d	d d	f j f| j|< n|jtjjj	kr| |||\}}||kr| jrtd
| d|  |}|tjkr| | d| n||tjkr| | d| | | d|d  nb|jtjjjtjjjfv rt|||dd\}}t| |fi | n@|jtjjjkr4t|||dd\}}t| |fi | n#|jtjjj krW|d }| j!"||}| | d|d  |d  | jrnt| j j
rcdnd d| d|  |S )NGlobalz._checkpoint_wrapped_moduler   r>   ra   z::inz::wz::outr?   zE: Fixing path `z` -> `z::w.gz::in.gz::out.gr   T)r   r   normalize_to_only_use_kwargsr   rU   z[BW]z[FW]z `z`: )#r   r   rg   r   _overloadpacketr%   r&   atenaddmmmmis_bwr   r@   cloneTr   r   r   printr   r   r   #_scaled_dot_product_flash_attention#_scaled_dot_product_cudnn_attentionr   r   r   r   FwOpOPERATORr   r'   r(   r   
setdefault)r   functypesr   r   r|   r   r   _biasinputweightnew_pathbwtype_r   r   r   r   __torch_dispatch__  sv   



&zAutoProbeD.__torch_dispatch__r   r   N)r   N)r-   r.   r/   nnModuler   r"   r   r   r   r   r   r%   r1   r	   r   r   r   r  __classcell__r   r   r   r   r   4  s    

	 

$r   r   r   c                 C   s2   |j d ur
| | | D ]}t| |d qd S )N)r   )_compiled_call_implrh   children_find_all_submodules_compiled)r   r   cr   r   r   r    s
   

r  c                   @   s4   e Zd ZdejddfddZd
ddZd
dd	ZdS )r   r   r   Nc                 C   s,   || _ g | _g | _tj | _dtjj_	d S )NF)
r   submodules_compiledcompiled_call_implr%   compilerdisabledisable_compile_dynamoconfigraise_on_ctx_manager_usage)r   r   r   r   r   r     s
   zTorchCompileDisabler.__init__c                 C   sJ   | j   t| j | j dd | j D | _| j D ]}d |_q| j  d S )Nc                 S   s   g | ]}|j qS r   )r  ).0mr   r   r   
<listcomp>  s    z2TorchCompileDisabler.__enter__.<locals>.<listcomp>)r   r   r  r   r!  r  r$  r   )r   r)  r   r   r   r     s   

zTorchCompileDisabler.__enter__c                 G   s4   | j j|  t| j| jD ]\}}||_qg | _d S r   )r$  r   re   r   r!  r  )r   r   r)  c_implr   r   r   r     s   
zTorchCompileDisabler.__exit__r  )r-   r.   r/   r  r  r   r   r   r   r   r   r   r     s    
r   i   r?   ra   c                   @      e Zd Zdd ZdS )
Attention1c                 C   s.   t j }t j||||d|jd tdgS )Nr   r   r`   )r   r   "LowerTriangularFromBottomRightMaskmemory_efficient_attentionreshaper@   seqlen)r   r   r   r   r   r   r*     s   
zAttention1.forwardNr-   r.   r/   r*   r   r   r   r   r-        r-  c                   @   r,  )
Attention2c                 C   s^   t jjtgt  }|ddt |jd |jd g}t j	||||d|jd tdgS )NrU   ra   r>   r.  r   r`   )
r   r   BlockDiagonalMaskfrom_seqlensr2  bsmake_causalr1  r@   r0  )r   r   r   xrr   r   r   r*   &  s   "zAttention2.forwardNr3  r   r   r   r   r5  %  r4  r5  c                       s$   e Zd Z fddZdd Z  ZS )AttentionSDPAc                    s   t    ttt| _d S r   )r   r   r  Lineardwor   r   r   r   r   1  s   
zAttentionSDPA.__init__c                 C   s:   | dd}| t||| dd|jd tdgS NrU   ra   r   r`   )r   r>  Fscaled_dot_product_attentionr1  r@   r2  r   r   r   r   r   r*   5  s   zAttentionSDPA.forwardr-   r.   r/   r   r*   r  r   r   r   r   r;  0  s    r;  c                   @   r,  )AttentionSDPAFlashc                 C   sh   | dd}ttj | t||| dd|jd t	dgW  d    S 1 s-w   Y  d S r?  )
r   r   r   FLASH_ATTENTIONr>  r@  rA  r1  r@   r2  rB  r   r   r   r*   ?  s   $zAttentionSDPAFlash.forwardNr3  r   r   r   r   rD  >  r4  rD  c                       s&   e Zd Zd fddZdd Z  ZS )Modelr   Nc                    sv   t    ttd| _ttttttt| _tjttdd| _| j	  t
 | _t | _t | _t | _d S )N   F)bias)r   r   r  r<  r=  head
Sequentialtrunkq_projcompiler-  attn1r5  attn2r;  attnSDPArD  attnSDPAflashr   r   r   r   r   J  s   



zModel.__init__c                 C   st   |j d td d}}}| ||t||g}| || | | | | | }t	|d}| 
| |S )Nr   @   	attns_out)r@   r=  rL  r1  r2  rN  rO  rP  rQ  r4   rI  rK  )r   r   BnHeadsDr   r   r   r*   X  s
   (
zModel.forwardr  rC  r   r   r   r   rF  I  s    rF  c                  C   s   g d} g d}t jj| | }tjddt| t|gtjdd}|j	|j
|j|jd}| }t|tt|j
d d|jj|jjd || |k  sRJ d S )	N)rU   rU         )ra   ra   rW     rU   cuda)r8   r6   ra   T)r^   r\   r]   )r   r   BlockDiagonalCausalMaskr7  make_causal_from_bottomrightr%   randnr   rb   materializer@   r8   r6   r  rv   ry   rf   	q_seqinfoseqstart	k_seqinfoallitem)q_seqlen	kv_seqlenr   rT   rH  logits_maskedr   r   r   test_masking`  s(   rg  c               	   C   s"  t dtjd} tjtttgfi | }t }t|j	t
jdd|_	|jdi |  |  tjj| dd}t|d}tdD ]}t }td	|  |d d
kr]|| d|i|_||}t|}|| |d d
kr|jsvJ tt|j  dD ]}	|	|jv sJ d|	 dqd|j	jfd|j	jjfd|fd|j jjfd|fd|ffD ]*\}	}
|	|jv sJ d|	 dt!|j|	 d |
" # $ sJ d|	 dq|j% D ]\}	}d|v rt&'|d ( sJ d|	 q|)  |*  W d    n	1 s	w   Y  q?d S )NrZ  r7   F)checkpoint_implpreserve_rng_stater{   )lrz./probe.jsonr?   z########### STEP rU   r   )zModel::attns_outzModel::attns_out.gzModel.attn1::attn_logitszModel.attn2::attn_logitszModel.attnSDPA::attn_logitsz Model.attnSDPAflash::attn_logitsModel.head::wModel.head::w.gzModel.head::inzModel.head::in.gModel.head::outModel.head::out.gzModel.trunk.0::inzModel.trunk.1::inzMissing key: ''rk  rl  zModel.q_proj::inzModel.q_proj::w.grm  rn  rE   z' mismatcheszInf/Nan for r   )+dictr%   rK   r]  r8  r2  r=  rF  r   rI  r
   NO_REENTRANTtorM  optimSGD
parametersr   rf   
contextlib	ExitStackr	  enter_contextr   
randn_liker,   r   ry   r   r   r  r+   rL  r   rI   rN   rA   itemsrW   isfiniterc  step	zero_grad)kwr   r)  rs  r   istackygr~   r9   r   r   r   r   test_toy_modelu  sd   











r  )F)NNr{   FNTFr  )Srv  	functoolsr   rW   r   r#   collectionsr   enumr   pathlibr   typingr   r   r   r   r	   r%   torch.nnr  torch.nn.functional
functionalr@  ;torch.distributed.algorithms._checkpoint.checkpoint_wrapperr
   r   torch.fx.operator_schemasr   torch.nn.attentionr   r   torch.utils._python_dispatchr   torch.utils._pytreer   torch.utils.module_trackerr   xformers.opsr   library	custom_opr1   r"   r   register_faker    autogradFunctionr!   r2   r4   r:   cacher6   r;   rS   r[   r   r   rv   rz   no_gradr   rI   r   r   r   r   r  r  r   Prober=  r2  r8  r-  r5  r;  rD  rF  rg  r  r   r   r   r   <module>   s   

1,	
@ @
