o
    i                     @   s  d dl mZmZ d dlZd dlmZ d dlmZ ddlmZm	Z	 ddl
mZmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZmZmZ ddlmZ ddlmZmZm Z m!Z!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z' ddl(m)Z) e*e+Z,G dd de#Z-G dd deZ.G dd deZ/G dd de%Z0G dd de$Z1G dd de Z2G dd  d e"Z3G d!d" d"e!Z4G d#d$ d$ee0Z5g d%Z6dS )&    )CallableOptionalN)nn)check_model_inputs   )CacheDynamicCache)create_causal_mask!create_sliding_window_causal_mask)FlashAttentionKwargs)GenericForQuestionAnswering)BaseModelOutputWithPast)ALL_ATTENTION_FUNCTIONS)Unpack)TransformersKwargsauto_docstringlogging)deprecate_kwarg   )
LlamaAttentionLlamaDecoderLayerLlamaForCausalLMLlamaForSequenceClassificationLlamaForTokenClassificationLlamaMLP
LlamaModelLlamaPreTrainedModelapply_rotary_pos_embeager_attention_forward   )MistralConfigc                       s   e Zd Z fddZ  ZS )
MistralMLPc                    sR   t  | tj| j| jdd| _tj| j| jdd| _tj| j| jdd| _d S )NFbias)	super__init__r   Linearhidden_sizeintermediate_size	gate_projup_proj	down_proj)selfconfig	__class__ _/home/ubuntu/.local/lib/python3.10/site-packages/transformers/models/mistral/modular_mistral.pyr%   &   s   zMistralMLP.__init__)__name__
__module____qualname__r%   __classcell__r0   r0   r.   r1   r!   %   s    r!   c                       s   e Zd Zdedef fddZedddd				dd
ejde	ejejf de
ej de
e de
ej dee de	eje
ej f fddZ  ZS )MistralAttentionr-   	layer_idxc                    s   t  || t|dd p|j|j | _tj|j|j| j dd| _tj|j|j	| j dd| _
tj|j|j	| j dd| _tj|j| j |jdd| _d S )Nhead_dimFr"   )r$   r%   getattrr'   num_attention_headsr8   r   r&   q_projnum_key_value_headsk_projv_projo_projr,   r-   r7   r.   r0   r1   r%   .   s    zMistralAttention.__init__past_key_valuepast_key_valuesz4.58)new_nameversionNhidden_statesposition_embeddingsattention_maskcache_positionkwargsreturnc                 K   s0  |j d d }g |d| jR }| ||dd}	| ||dd}
| ||dd}|\}}t|	|
||\}	}
|d urW|||d}||
|| j	|\}
}t
}| jjdkret| jj }|| |	|
||f| jsqdn| j| jt| jdd d|\}}|jg |dR   }| |}||fS )	Nr   r   )sincosrH   eagerg        sliding_window)dropoutscalingrO   )shaper8   r;   view	transposer=   r>   r   updater7   r   r-   _attn_implementationr   trainingattention_dropoutrQ   r9   reshape
contiguousr?   )r,   rE   rF   rG   rB   rH   rI   input_shapehidden_shapequery_states
key_statesvalue_statesrM   rL   cache_kwargsattention_interfaceattn_outputattn_weightsr0   r0   r1   forward6   s:   
	

zMistralAttention.forward)NN)r2   r3   r4   r    intr%   r   torchTensortupler   r   
LongTensorr   r   rd   r5   r0   r0   r.   r1   r6   -   s(    r6   c                       s&   e Zd Zdedef fddZ  ZS )MistralDecoderLayerr-   r7   c                    s*   t  || t||d| _t|| _d S )N)r-   r7   )r$   r%   r6   	self_attnr!   mlpr@   r.   r0   r1   r%   e   s   zMistralDecoderLayer.__init__)r2   r3   r4   r    re   r%   r5   r0   r0   r.   r1   rj   d   s    rj   c                   @   s   e Zd ZeedZdS )MistralPreTrainedModel)rE   
attentionsN)r2   r3   r4   rj   r6   _can_record_outputsr0   r0   r0   r1   rm   k   s    
rm   c                   @   s|   e Zd Zee							ddeej deej deej dee	 deej
 dee deej d	ee d
efddZdS )MistralModelN	input_idsrG   position_idsrB   inputs_embeds	use_cacherH   rI   rJ   c              
   K   s  |d u |d uA rt d|d u r| |}|r!|d u r!t| jd}|d u r=|d ur-| nd}	tj|	|	|jd  |jd}|d u rF|	d}| jj
d u rNtnt}
|
| j|||||d}|}| ||}| jd | jj D ]}||f||||||d|}ql| |}t||r|dS d dS )	Nz:You must specify exactly one of input_ids or inputs_embeds)r-   r   r   )device)r-   input_embedsrG   rH   rB   rr   )rG   rr   rB   rt   rH   rF   )last_hidden_staterB   )
ValueErrorembed_tokensr   r-   get_seq_lengthrf   arangerR   ru   	unsqueezerO   r	   r
   
rotary_emblayersnum_hidden_layersnormr   )r,   rq   rG   rr   rB   rs   rt   rH   rI   past_seen_tokensmask_functioncausal_maskrE   rF   decoder_layerr0   r0   r1   rd   s   sX   

	

zMistralModel.forward)NNNNNNN)r2   r3   r4   r   r   r   rf   ri   rg   r   FloatTensorboolr   r   r   rd   r0   r0   r0   r1   rp   r   s:    	
rp   c                   @      e Zd ZdS )MistralForCausalLMNr2   r3   r4   r0   r0   r0   r1   r          r   c                   @   r   )MistralForTokenClassificationNr   r0   r0   r0   r1   r      r   r   c                   @   r   ) MistralForSequenceClassificationNr   r0   r0   r0   r1   r      r   r   c                   @   r   )MistralForQuestionAnsweringNr   r0   r0   r0   r1   r      s    r   )r   r   rp   rm   r   r   )7typingr   r   rf   r   transformers.utils.genericr   cache_utilsr   r   masking_utilsr	   r
   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   modeling_utilsr   processing_utilsr   utilsr   r   r   utils.deprecationr   llama.modeling_llamar   r   r   r   r   r   r   r   r   r   configuration_mistralr    
get_loggerr2   loggerr!   r6   rj   rm   rp   r   r   r   r   __all__r0   r0   r0   r1   <module>   s4    0
7?