o
    ¾e¦iJ  ã                   @   s¤  d dl Z d dlZd dlmZ ddlmZ ddlmZmZ ddl	m
Z
mZ ddlmZ ddlmZ d	d
lmZ d	dlmZmZmZmZmZmZmZmZmZ d	dlmZ ddlmZ e  e!¡Z"dZ#dZ$G dd„ deƒZ%dd„ Z&G dd„ deƒZ'G dd„ dej(ƒZ)G dd„ de)ƒZ*G dd„ de)ƒZ+e)e*e+dœZ,G dd„ deƒZ-G d d!„ d!eƒZ.G d"d#„ d#eƒZ/G d$d%„ d%eƒZ0G d&d'„ d'eƒZ1G d(d)„ d)eƒZ2G d*d+„ d+eƒZ3g d,¢Z4dS )-é    N)Únné   )Úinitialization)ÚCacheÚStaticCache)Ú_flash_attention_forwardÚ!flash_attn_supports_top_left_mask)ÚPreTrainedModel)Úloggingé   )ÚGemmaForCausalLM)	ÚLlamaDecoderLayerÚLlamaForQuestionAnsweringÚLlamaForSequenceClassificationÚLlamaForTokenClassificationÚ
LlamaModelÚLlamaPreTrainedModelÚLlamaRotaryEmbeddingÚapply_rotary_pos_embÚ	repeat_kv)Ú
MistralMLPé   )ÚDiffLlamaConfigzkajuma/DiffLlama-0.3B-handcutr   c                   @   ó   e Zd ZdS )ÚDiffLlamaMLPN©Ú__name__Ú
__module__Ú__qualname__© r   r   úm/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/diffllama/modular_diffllama.pyr   1   ó    r   c                 C   s   ddt  d|  ¡  S )Ngš™™™™™é?g333333ã?g333333Ó¿)ÚmathÚexp)Ú	layer_idxr   r   r    Úlambda_init_fn5   s   r%   c                   @   r   )ÚDiffLlamaRotaryEmbeddingNr   r   r   r   r    r&   9   r!   r&   c                       s¦   e Zd ZdZddededB f‡ fdd„Z					ddejd	e	ejejf d
ejdB dej
dB dedB dedej
dB de	ejejdB e	ej dB f fdd„Z‡  ZS )ÚDiffLlamaAttentionz=Multi-headed attention from 'Attention Is All You Need' paperNÚconfigr$   c                    sž  t ƒ  ¡  || _|| _|d u rt d| jj› d¡ |j| _|j	| _	|j
| _t|d| j	| j ƒ| _|j| _| j| j | _|j| _d| _tj| j	| j| j |jd| _tj| j	| j| j |jd| _tj| j	| j| j |jd| _tj| j| j | j	|jd| _t|ƒ| _t tjd|j| jfd¡| _t tjd|j| jfd¡| _ t tjd|j| jfd¡| _!t tjd|j| jfd¡| _"tj#d| j |j$d	d
| _%d S )NzInstantiating z¹ without passing a `layer_idx` is not recommended and will lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` when creating this class.Úhead_dimT)Úbiasr   )Úsizer   F)ÚepsÚelementwise_affine)&ÚsuperÚ__init__r(   r$   ÚloggerÚwarning_onceÚ	__class__r   Úattention_dropoutÚhidden_sizeÚnum_attention_headsÚ	num_headsÚgetattrr)   Únum_key_value_headsÚnum_key_value_groupsÚmax_position_embeddingsÚ	is_causalr   ÚLinearÚattention_biasÚq_projÚk_projÚv_projÚo_projr%   Úlambda_initÚ	ParameterÚtorchÚnormalÚlambda_std_devÚ	lambda_q1Ú	lambda_k1Ú	lambda_q2Ú	lambda_k2ÚRMSNormÚrms_norm_epsÚ	groupnorm©Úselfr(   r$   ©r2   r   r    r/   @   s2   
ÿ
zDiffLlamaAttention.__init__FÚhidden_statesÚposition_embeddingsÚattention_maskÚposition_idsÚpast_key_valuesÚ	use_cacheÚcache_positionÚreturnc                 K   s:  |  ¡ \}	}
}|
}|  |¡}|  |¡}|  |¡}| |	|| j| j¡ dd¡}| |	|| j| j¡ dd¡}| |	|| j| j¡ dd¡}|\}}t	||||ƒ\}}|d urd|||dœ}| 
||| j|¡\}}t|| jƒ}t|| jƒ}tjtj|ddddd}| dddd¡}t || dd¡¡t | j¡ }|d ur|| }tjj|dtjd |j¡}tjj|| j| jd}t tj| j | j! dtjd¡ |j¡}t tj| j"| j# dtjd¡ |j¡}|| | j$ }t ||¡}tj|ddd\}}|||  }d| j$ |  %|¡ }| dd¡ &¡ }| '|	|d¡}|  (|¡}||fS )	Nr   r   ©ÚsinÚcosrW   ©Údiméÿÿÿÿr   ©r]   Údtype)ÚpÚtraining))r+   r>   r?   r@   Úviewr6   r)   Ú	transposer8   r   Úupdater$   r   r9   rD   ÚcatÚchunkÚrepeatÚmatmulr"   Úsqrtr   Ú
functionalÚsoftmaxÚfloat32Útor`   Údropoutr3   rb   r#   ÚsumrG   rH   rI   rJ   rB   rM   Ú
contiguousÚreshaperA   )rO   rQ   rR   rS   rT   rU   rV   rW   ÚkwargsÚbszÚ
target_lenÚ_Úq_lenÚquery_statesÚ
key_statesÚvalue_statesr[   rZ   Úcache_kwargsÚattn_weightsÚlambda_1Úlambda_2Úlambda_fullÚattn_outputÚattn_output1Úattn_output2r   r   r    Úforwarda   sJ   


  ÿ ÿ
zDiffLlamaAttention.forward©N©NNNFN)r   r   r   Ú__doc__r   Úintr/   rD   ÚTensorÚtupleÚ
LongTensorr   Úboolrƒ   Ú__classcell__r   r   rP   r    r'   =   s2    %øþýüûúùø
ör'   c                       s„   e Zd ZdZ‡ fdd„Z					ddejdeejejf dejdB d	ejdB d
e	dB de
dejdB deejdf fdd„Z‡  ZS )ÚDiffLlamaFlashAttention2aN  
    DiffLlama flash attention module. This module inherits from `DiffLlamaAttention` as the weights of the module stays
    untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
    flash attention and deal with padding tokens in case the input contains any of them.
    c                    s   t ƒ j|i |¤Ž tƒ | _d S r„   )r.   r/   r   Ú_flash_attn_uses_top_left_mask)rO   Úargsrs   rP   r   r    r/   ¦   s   z!DiffLlamaFlashAttention2.__init__NFrQ   rR   rS   rT   rU   rV   rW   rX   c                 C   sä  t |tƒr	tdƒ‚| ¡ \}}	}
|  |¡}|  |¡}|  |¡}| ||	| j| j	¡ 
dd¡}| ||	| j| j	¡ 
dd¡}| ||	| j| j	¡ 
dd¡}|\}}t||||ƒ\}}|d urk|||dœ}| ||| j|¡\}}| 
dd¡}| 
dd¡}| 
dd¡}| jrƒ| jnd}|j}|jjdkr’|jjnd}|tjkrÌt |¡r¤t |¡}nt| jdƒr¯| jj}n| jjj}t d	|› d
¡ | |¡}| |¡}| |¡}tj|ddd\}}| dddd¡}| dddd¡}t |||||	||t!| dd ƒ| j"| j#d
}t |||||	||t!| dd ƒ| j"| j#d
}tj$||gdd}tj|ddd\}}t %tj&| j'| j( dtjd¡ |j¡}t %tj&| j)| j* dtjd¡ |j¡}|| | j+ }|||  }d| j+ |  ,|¡ }| -||	d¡ .¡ }|  /|¡}|d fS )NzÈ`static` cache implementation is not compatible with `attn_implementation==flash_attention_2` make sure to use `sdpa` in the mean time, and open an issue at https://github.com/huggingface/transformersr   r   rY   ç        ÚmpsÚcpuÚ_is_quantizedz¾The input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in Ú.r\   Úsliding_window)rT   ro   r•   Úuse_top_left_maskr;   r^   r_   )0Ú
isinstancer   Ú
ValueErrorr+   r>   r?   r@   rc   r6   r)   rd   r8   r   re   r$   rb   r3   r`   ÚdeviceÚtyperD   rm   Úis_autocast_enabledÚget_autocast_dtypeÚhasattrr(   Úweightr0   r1   rn   rg   rh   r   r7   rŽ   r;   rf   r#   rp   rG   rH   rI   rJ   rB   rM   rr   rq   rA   )rO   rQ   rR   rS   rT   rU   rV   rW   rt   rw   rv   rx   ry   rz   r[   rZ   r{   Údropout_rateÚinput_dtypeÚdevice_typeÚtarget_dtypeÚvalue_states1Úvalue_states2r   r‚   r€   r}   r~   r   r   r   r    rƒ   ®   sš   

ÿ






þÿ



ö
ö ÿ ÿ
z DiffLlamaFlashAttention2.forwardr…   )r   r   r   r†   r/   rD   rˆ   r‰   rŠ   r   r‹   rƒ   rŒ   r   r   rP   r    r   Ÿ   s2    øþýüûúùø	÷r   c                   @   s†   e Zd ZdZ					ddejdeejejf dejdB dejdB dedB d	e	d
ejdB deejejdB eej dB f fdd„Z
dS )ÚDiffLlamaSdpaAttentiona   
    DiffLlama attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
    `DiffLlamaAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to
    SDPA API.
    NFrQ   rR   rS   rT   rU   rV   rW   rX   c                 K   s0  |  ¡ \}	}
}|  |¡}|  |¡}|  |¡}| |	|
| j| j¡ dd¡}| |	|
| j| j¡ dd¡}| |	|
| j| j¡ dd¡}|\}}t	||||ƒ\}}|d urb|||dœ}| 
||| j|¡\}}t|| jƒ}t|| jƒ}tjtj|ddddd}| dddd¡}|}|d urœ|d d …d d …d d …d |jd …f }|d u o£|
dk}tjjj||||| jr²| jnd|d}tj|ddd\}}t tj| j| j dtjd	¡ |j¡}t tj| j| j  dtjd	¡ |j¡}|| | j! }|||  }d| j! |  "|¡ }| dd¡ #¡ }| |	|
d¡}|  $|¡}|d fS )
Nr   r   rY   r\   r^   éþÿÿÿr   )Ú	attn_maskÚ	dropout_pr;   r_   )%r+   r>   r?   r@   rc   r6   r)   rd   r8   r   re   r$   r   r9   rD   rf   rg   rh   Úshaper   rk   Úscaled_dot_product_attentionrb   r3   r#   rp   rG   rH   rm   rn   r`   rI   rJ   rB   rM   rq   rA   )rO   rQ   rR   rS   rT   rU   rV   rW   rs   rt   rw   rv   rx   ry   rz   r[   rZ   r{   Úcausal_maskr;   r€   r   r‚   r}   r~   r   r   r   r    rƒ   .  sT   


&ú	 ÿ ÿ
zDiffLlamaSdpaAttention.forwardr…   )r   r   r   r†   rD   rˆ   r‰   rŠ   r   r‹   rƒ   r   r   r   r    r¥   &  s0    øþýüûúùø
ör¥   )ÚeagerÚflash_attention_2Úsdpac                       s&   e Zd Zdedef‡ fdd„Z‡  ZS )ÚDiffLlamaDecoderLayerr(   r$   c                    s&   t ƒ  ||¡ t|j ||d| _d S )N)r(   r$   )r.   r/   ÚDIFFLLAMA_ATTENTION_CLASSESÚ_attn_implementationÚ	self_attnrN   rP   r   r    r/   {  s   zDiffLlamaDecoderLayer.__init__)r   r   r   r   r‡   r/   rŒ   r   r   rP   r    r¯   z  s    r¯   c                   @   s$   e Zd ZdZdZe ¡ dd„ ƒZdS )ÚDiffLlamaPreTrainedModelFc                 C   sn   t  | |¡ t|tƒr5t |jd| jj¡ t |j	d| jj¡ t |j
d| jj¡ t |jd| jj¡ d S d S )Nr   )r	   Ú_init_weightsr—   r'   ÚinitÚnormal_rG   r(   rF   rH   rI   rJ   )rO   Úmoduler   r   r    r´   …  s   
üz&DiffLlamaPreTrainedModel._init_weightsN)r   r   r   Ú_supports_flex_attnÚ_supports_attention_backendrD   Úno_gradr´   r   r   r   r    r³     s
    r³   c                   @   r   )ÚDiffLlamaModelNr   r   r   r   r    r»     r!   r»   c                   @   r   )ÚDiffLlamaForCausalLMNr   r   r   r   r    r¼   “  r!   r¼   c                   @   r   )Ú"DiffLlamaForSequenceClassificationNr   r   r   r   r    r½   —  r!   r½   c                   @   r   )ÚDiffLlamaForQuestionAnsweringNr   r   r   r   r    r¾   ›  r!   r¾   c                   @   r   )ÚDiffLlamaForTokenClassificationNr   r   r   r   r    r¿   Ÿ  r!   r¿   )r³   r»   r¼   r½   r¾   r¿   )5r"   rD   r   Ú r   rµ   Úcache_utilsr   r   Úmodeling_flash_attention_utilsr   r   Úmodeling_utilsr	   Úutilsr
   Úgemma.modeling_gemmar   Úllama.modeling_llamar   r   r   r   r   r   r   r   r   Úmistral.modeling_mistralr   Úconfiguration_diffllamar   Ú
get_loggerr   r0   Ú_CHECKPOINT_FOR_DOCÚ_CONFIG_FOR_DOCr   r%   r&   ÚModuler'   r   r¥   r°   r¯   r³   r»   r¼   r½   r¾   r¿   Ú__all__r   r   r   r    Ú<module>   sD   ,
b Ný