from typing import Callable, Optional

import torch
import torch.utils.checkpoint
from torch import nn

from ...cache_utils import Cache, DynamicCache
from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_outputs import BaseModelOutputWithPast
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
from ...processing_utils import Unpack
from ...utils import auto_docstring, can_return_tuple, logging
from ..llama.modeling_llama import (
    LlamaAttention,
    LlamaDecoderLayer,
    LlamaForCausalLM,
    LlamaForQuestionAnswering,
    LlamaForSequenceClassification,
    LlamaForTokenClassification,
    LlamaMLP,
    LlamaPreTrainedModel,
    apply_rotary_pos_emb,
    eager_attention_forward,
)
from ..mistral.modeling_mistral import MistralModel
from .configuration_qwen2 import Qwen2Config


logger = logging.get_logger(__name__)


class Qwen2MLP(LlamaMLP):
    def __init__(self, config):
        super().__init__(config)
        # Bias-free MLP projections, same as Llama
        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)


class Qwen2Attention(LlamaAttention):
    def __init__(self, config: Qwen2Config, layer_idx: int):
        super().__init__(config, layer_idx)
        # Unlike Llama, Qwen2 uses biases on the q/k/v projections (but not on the output projection)
        self.q_proj = nn.Linear(config.hidden_size, config.num_attention_heads * self.head_dim, bias=True)
        self.k_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=True)
        self.v_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=True)
        self.o_proj = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=False)
        self.sliding_window = config.sliding_window if config.layer_types[layer_idx] == "sliding_attention" else None

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor],
        past_key_value: Optional[Cache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.head_dim)

        query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
        key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)

        cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_value is not None:
            # sin and cos are specific to RoPE models; cache_position is needed for the static cache
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.attention_dropout,
            scaling=self.scaling,
            sliding_window=self.sliding_window,
            **kwargs,
        )

        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
        attn_output = self.o_proj(attn_output)
        return attn_output, attn_weights


class Qwen2DecoderLayer(LlamaDecoderLayer):
    def __init__(self, config: Qwen2Config, layer_idx: int):
        super().__init__()
        self.attention_type = config.layer_types[layer_idx]


class Qwen2PreTrainedModel(LlamaPreTrainedModel):
    pass


class Qwen2Model(MistralModel):
    def __init__(self, config: Qwen2Config):
        super().__init__(config)
        self.has_sliding_layers = "sliding_attention" in self.config.layer_types

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **flash_attn_kwargs: Unpack[FlashAttentionKwargs],
    ) -> BaseModelOutputWithPast:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
            )
            use_cache = False

        if not isinstance(past_key_values, (type(None), Cache)):
            raise ValueError("The `past_key_values` should be either a `Cache` object or `None`.")

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        if use_cache and past_key_values is None:
            past_key_values = DynamicCache()

        if cache_position is None:
            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
            cache_position = torch.arange(
                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
            )

        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        # The mask mapping may already have been prepared by e.g. `generate`
        if not isinstance(causal_mask_mapping := attention_mask, dict):
            # Prepare mask arguments
            mask_kwargs = {
                "config": self.config,
                "input_embeds": inputs_embeds,
                "attention_mask": attention_mask,
                "cache_position": cache_position,
                "past_key_values": past_key_values,
                "position_ids": position_ids,
            }
            # Create the masks
            causal_mask_mapping = {
                "full_attention": create_causal_mask(**mask_kwargs),
            }
            # The sliding-window layers are only activated if the config requests them
            if self.has_sliding_layers:
                causal_mask_mapping["sliding_attention"] = create_sliding_window_causal_mask(**mask_kwargs)

        hidden_states = inputs_embeds

        # create position embeddings to be shared across the decoder layers
        position_embeddings = self.rotary_emb(hidden_states, position_ids)

        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None

        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            layer_outputs = decoder_layer(
                hidden_states,
                attention_mask=causal_mask_mapping[decoder_layer.attention_type],
                position_ids=position_ids,
                past_key_value=past_key_values,
                output_attentions=output_attentions,
                use_cache=use_cache,
                cache_position=cache_position,
                position_embeddings=position_embeddings,
                **flash_attn_kwargs,
            )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

        hidden_states = self.norm(hidden_states)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values if use_cache else None,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )


class Qwen2ForCausalLM(LlamaForCausalLM):
    pass


class Qwen2ForSequenceClassification(LlamaForSequenceClassification):
    pass


class Qwen2ForTokenClassification(LlamaForTokenClassification):
    pass


class Qwen2ForQuestionAnswering(LlamaForQuestionAnswering):
    pass


__all__ = [
    "Qwen2PreTrainedModel",
    "Qwen2Model",
    "Qwen2ForCausalLM",
    "Qwen2ForSequenceClassification",
    "Qwen2ForTokenClassification",
    "Qwen2ForQuestionAnswering",
]
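
# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the library module): loading a Qwen2
# checkpoint through the public Auto classes, which resolve to the classes
# defined above. The checkpoint id below is an assumption for illustration;
# substitute any Qwen2-architecture model.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    from transformers import AutoModelForCausalLM, AutoTokenizer

    model_name = "Qwen/Qwen2-0.5B"  # assumed checkpoint id, for illustration only
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)  # resolves to Qwen2ForCausalLM

    # Generate a short continuation and decode it back to text.
    inputs = tokenizer("Hello, Qwen2!", return_tensors="pt")
    outputs = model.generate(**inputs, max_new_tokens=20)
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))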