o
    i%                     @   s  d dl mZmZ d dlZd dlmZ d dlmZ ddlmZm	Z	 ddl
mZ ddlmZmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZmZmZ ddlmZ ddlmZ ddlm Z  ddl!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z*m+Z+ ddl,m-Z- ddl.m/Z/ e0e1Z2G dd de(Z3G dd de"Z4e5e  e5dkrG dd dej6Z7nedG dd dej8Z7G dd  d e#Z9G d!d" d"e)Z:G d#d$ d$e-Z;G d%d& d&e$Z<G d'd( d(e&Z=G d)d* d*e'Z>G d+d, d,e%Z?g d-Z@dS ).    )CallableOptionalN)version)nn   )CacheDynamicCache)use_kernel_forward_from_hub)create_causal_mask!create_sliding_window_causal_mask)FlashAttentionKwargs)BaseModelOutputWithPast)ALL_ATTENTION_FUNCTIONS)Unpack)TransformersKwargsauto_docstringlogging)deprecate_kwarg)check_model_inputs)get_torch_version   )
LlamaAttentionLlamaDecoderLayerLlamaForCausalLMLlamaForQuestionAnsweringLlamaForSequenceClassificationLlamaForTokenClassificationLlamaMLPLlamaPreTrainedModelapply_rotary_pos_embeager_attention_forward)MistralModel   )Qwen2Configc                       s   e Zd Z fddZ  ZS )Qwen2MLPc                    sR   t  | tj| j| jdd| _tj| j| jdd| _tj| j| jdd| _d S )NFbias)	super__init__r   Linearhidden_sizeintermediate_size	gate_projup_proj	down_projselfconfig	__class__ d/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/transformers/models/qwen2/modular_qwen2.pyr(   (   s   zQwen2MLP.__init__)__name__
__module____qualname__r(   __classcell__r4   r4   r2   r5   r$   '   s    r$   c                       s   e Zd Zdedef fddZedddd				dd
ejde	ejejf de
ej de
e de
ej dee de	eje
ej f fddZ  ZS )Qwen2Attentionr1   	layer_idxc                    s   t  || tj|j|j| j dd| _tj|j|j| j dd| _	tj|j|j| j dd| _
tj|j| j |jdd| _|j| dkrL|j| _d S d | _d S )NTr%   Fsliding_attention)r'   r(   r   r)   r*   num_attention_headshead_dimq_projnum_key_value_headsk_projv_projo_projlayer_typessliding_windowr0   r1   r;   r2   r4   r5   r(   0   s   $zQwen2Attention.__init__past_key_valuepast_key_valuesz4.58)new_namer   Nhidden_statesposition_embeddingsattention_maskcache_positionkwargsreturnc                 K   s(  |j d d }g |d| jR }| ||dd}	| ||dd}
| ||dd}|\}}t|	|
||\}	}
|d urW|||d}||
|| j	|\}
}t
}| jjdkret| jj }|| |	|
||f| jsqdn| j| j| jd|\}}|jg |dR   }| |}||fS )Nr"   r   )sincosrM   eagerg        )dropoutscalingrE   )shaper>   r?   view	transposerA   rB   r   updater;   r    r1   _attn_implementationr   trainingattention_dropoutrU   rE   reshape
contiguousrC   )r0   rJ   rK   rL   rH   rM   rN   input_shapehidden_shapequery_states
key_statesvalue_statesrR   rQ   cache_kwargsattention_interfaceattn_outputattn_weightsr4   r4   r5   forward8   s:   
	

zQwen2Attention.forward)NN)r6   r7   r8   r#   intr(   r   torchTensortupler   r   
LongTensorr   r   rh   r9   r4   r4   r2   r5   r:   /   s(    r:   z2.3.0c                       s(   e Zd Zddeddf fddZ  ZS )Qwen2RMSNormư>epsrO   Nc                    s   t  j||dd d S )NT)normalized_shaperp   elementwise_affine)r'   r(   r0   r*   rp   r2   r4   r5   r(   i   s   Qwen2RMSNorm.__init__ro   )r6   r7   r8   floatr(   r9   r4   r4   r2   r5   rn   h   s     rn   RMSNormc                       sF   e Zd Zddeddf fddZdejdejfdd	Zd
d Z  Z	S )rn   ro   rp   rO   Nc                    s&   t    tt|| _|| _dS )zC
            Qwen2RMSNorm is equivalent to T5LayerNorm
            N)r'   r(   r   	Parameterrj   onesweightvariance_epsilonrs   r2   r4   r5   r(   p   s   

rt   rJ   c                 C   sJ   |j }|tj}|djddd}|t|| j  }| j|| S )Nr   rP   T)keepdim)	dtypetorj   float32powmeanrsqrtr{   rz   )r0   rJ   input_dtypevariancer4   r4   r5   rh   x   s
   zQwen2RMSNorm.forwardc                 C   s   t | jj d| j S )Nz, eps=)rl   rz   rV   r{   )r0   r4   r4   r5   
extra_repr   s   zQwen2RMSNorm.extra_reprru   )
r6   r7   r8   rv   r(   rj   rk   rh   r   r9   r4   r4   r2   r5   rn   n   s    c                       s&   e Zd Zdedef fddZ  ZS )Qwen2DecoderLayerr1   r;   c                    s    t  j||d |j| | _d S )N)r1   r;   )r'   r(   rD   attention_typerF   r2   r4   r5   r(      s   zQwen2DecoderLayer.__init__)r6   r7   r8   r#   ri   r(   r9   r4   r4   r2   r5   r      s    r   c                   @      e Zd ZdS )Qwen2PreTrainedModelNr6   r7   r8   r4   r4   r4   r5   r          r   c                       s   e Zd Zdef fddZee							ddeej	 deej
 deej	 dee d	eej d
ee deej	 dee defddZ  ZS )
Qwen2Modelr1   c                    s   t  | d| jjv | _d S )Nr<   )r'   r(   r1   rD   has_sliding_layersr/   r2   r4   r5   r(      s   zQwen2Model.__init__N	input_idsrL   position_idsrH   inputs_embeds	use_cacherM   rN   rO   c              
   K   sF  |d u |d uA rt d|d u r| |}|r!|d u r!t| jd}|d u r=|d ur-| nd}	tj|	|	|jd  |jd}|d u rF|	d}t
| }
tsl| j|||||d}dtdi |i}
| jrltdi ||
d< |}| ||}| jd | jj D ]}||f|
|j |||||d	|}q}| |}t||r|d
S d d
S )Nz:You must specify exactly one of input_ids or inputs_embeds)r1   r   r"   )device)r1   input_embedsrL   rM   rH   r   full_attentionr<   )rL   r   rH   r   rM   rK   )last_hidden_staterH   r4   )
ValueErrorembed_tokensr   r1   get_seq_lengthrj   arangerV   r   	unsqueeze
isinstancedictr
   r   r   
rotary_emblayersnum_hidden_layersr   normr   )r0   r   rL   r   rH   r   r   rM   rN   past_seen_tokenscausal_mask_mappingmask_kwargsrJ   rK   decoder_layerr4   r4   r5   rh      s^   



zQwen2Model.forward)NNNNNNN)r6   r7   r8   r#   r(   r   r   r   rj   rm   rk   r   FloatTensorboolr   r   r   rh   r9   r4   r4   r2   r5   r      s<    	
r   c                   @   r   )Qwen2ForCausalLMNr   r4   r4   r4   r5   r      r   r   c                   @   r   )Qwen2ForSequenceClassificationNr   r4   r4   r4   r5   r      r   r   c                   @   r   )Qwen2ForTokenClassificationNr   r4   r4   r4   r5   r      r   r   c                   @   r   )Qwen2ForQuestionAnsweringNr   r4   r4   r4   r5   r      r   r   )r   r   r   rn   r   r   r   )Atypingr   r   rj   	packagingr   r   cache_utilsr   r   integrationsr	   masking_utilsr
   r   modeling_flash_attention_utilsr   modeling_outputsr   modeling_utilsr   processing_utilsr   utilsr   r   r   utils.deprecationr   utils.genericr   utils.import_utilsr   llama.modeling_llamar   r   r   r   r   r   r   r   r   r    mistral.modeling_mistralr!   configuration_qwen2r#   
get_loggerr6   loggerr$   r:   parserw   rn   Moduler   r   r   r   r   r   r   __all__r4   r4   r4   r5   <module>   sB    0
7O