o
    wi+                     @   s~  d dl mZmZmZ d dlZd dlZd dlmZ ddlmZm	Z	 ddl
mZmZ ddlmZ ddlmZmZ dd	lmZ dd
lmZ ddlmZmZmZ ddlmZmZmZmZmZm Z m!Z!m"Z"m#Z#m$Z$m%Z% ddl&m'Z' e(e)Z*G dd de!Z+G dd deZ,G dd deZ-G dd de#Z.G dd de"Z/G dd deZ0G dd de Z1G dd deZ2G d d! d!eZ3g d"Z4dS )#    )CallableOptionalUnionN)nn   )CacheDynamicCache)create_causal_mask!create_sliding_window_causal_mask)FlashAttentionKwargs)BaseModelOutputWithPastQuestionAnsweringModelOutput)ALL_ATTENTION_FUNCTIONS)Unpack)auto_docstringcan_return_tuplelogging   )LlamaAttentionLlamaDecoderLayerLlamaForCausalLMLlamaForQuestionAnsweringLlamaForSequenceClassificationLlamaForTokenClassificationLlamaMLP
LlamaModelLlamaPreTrainedModelapply_rotary_pos_embeager_attention_forward   )MistralConfigc                       s   e Zd Z fddZ  ZS )
MistralMLPc                    sR   t  | tj| j| jdd| _tj| j| jdd| _tj| j| jdd| _d S )NFbias)	super__init__r   Linearhidden_sizeintermediate_size	gate_projup_proj	down_projselfconfig	__class__ h/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/transformers/models/mistral/modular_mistral.pyr%   "   s   zMistralMLP.__init__)__name__
__module____qualname__r%   __classcell__r1   r1   r/   r2   r!   !   s    r!   c                       s   e Zd Zdedef fddZ		ddejdeejejf de	ej d	e	e
 d
e	ej dee deeje	ej e	eej  f fddZ  ZS )MistralAttentionr.   	layer_idxc                    s   t    t|dd p|j|j | _tj|j|j| j dd| _tj|j|j	| j dd| _
tj|j|j	| j dd| _tj|j| j |jdd| _d S )Nhead_dimFr"   )r$   r%   getattrr'   num_attention_headsr9   r   r&   q_projnum_key_value_headsk_projv_projo_projr-   r.   r8   r/   r1   r2   r%   *   s   
 zMistralAttention.__init__Nhidden_statesposition_embeddingsattention_maskpast_key_valuecache_positionkwargsreturnc                 K   s0  |j d d }g |d| jR }| ||dd}	| ||dd}
| ||dd}|\}}t|	|
||\}	}
|d urW|||d}||
|| j	|\}
}t
}| jjdkret| jj }|| |	|
||f| jsqdn| j| jt| jdd d|\}}|jg |dR   }| |}||fS )	Nr   r   )sincosrF   eagerg        sliding_window)dropoutscalingrM   )shaper9   r<   view	transposer>   r?   r   updater8   r   r.   _attn_implementationr   trainingattention_dropoutrO   r:   reshape
contiguousr@   )r-   rB   rC   rD   rE   rF   rG   input_shapehidden_shapequery_states
key_statesvalue_statesrK   rJ   cache_kwargsattention_interfaceattn_outputattn_weightsr1   r1   r2   forward2   s:   		

zMistralAttention.forward)NN)r3   r4   r5   r    intr%   torchTensortupler   r   
LongTensorr   r   rb   r6   r1   r1   r/   r2   r7   )   s&    r7   c                       s&   e Zd Zdedef fddZ  ZS )MistralDecoderLayerr.   r8   c                    s*   t  || t||d| _t|| _d S )N)r.   r8   )r$   r%   r7   	self_attnr!   mlprA   r/   r1   r2   r%   `   s   zMistralDecoderLayer.__init__)r3   r4   r5   r    rc   r%   r6   r1   r1   r/   r2   rh   _   s    rh   c                   @      e Zd ZdS )MistralPreTrainedModelNr3   r4   r5   r1   r1   r1   r2   rl   f       rl   c                   @   s   e Zd Zee									ddeej deej deej dee	 deej
 dee dee d	ee d
eej dee defddZdS )MistralModelN	input_idsrD   position_idspast_key_valuesinputs_embeds	use_cacheoutput_attentionsoutput_hidden_statesrF   flash_attn_kwargsrH   c
                 K   s  |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}|d u |d uA r*td| jr9| jr9|r9td d}t	|t
d tfsFtd|d u rO| |}|rX|d u rXt }|	d u rt|d urd| nd}tj|||jd  |jd}	|d u r}|	d}| j jd u rtnt}|| j |||	||d}|}| ||}|rd	nd }|rd	nd }| jd | j j D ]&}|r||f7 }||f||||||	|d
|
}|d }|r||d f7 }q| |}|r||f7 }t||r|nd ||dS )Nz:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.FzBThe `past_key_values` should be either a `Cache` object or `None`.r   r   )device)r.   input_embedsrD   rF   rr   rq   r1   )rD   rq   rE   ru   rt   rF   rC   )last_hidden_staterr   rB   
attentions)r.   ru   rv   rt   
ValueErrorgradient_checkpointingrU   loggerwarning_once
isinstancetyper   embed_tokensr   get_seq_lengthrd   arangerP   rx   	unsqueezerM   r	   r
   
rotary_emblayersnum_hidden_layersnormr   )r-   rp   rD   rq   rr   rs   rt   ru   rv   rF   rw   past_seen_tokensmask_functioncausal_maskrB   rC   all_hidden_statesall_self_attnsdecoder_layerlayer_outputsr1   r1   r2   rb   k   s   

	
	


zMistralModel.forward	NNNNNNNNN)r3   r4   r5   r   r   r   rd   rg   re   r   FloatTensorboolr   r   r   rb   r1   r1   r1   r2   ro   j   sF    	
ro   c                   @   rk   )MistralForCausalLMNrm   r1   r1   r1   r2   r      rn   r   c                   @   rk   )MistralForTokenClassificationNrm   r1   r1   r1   r2   r      rn   r   c                   @   rk   ) MistralForSequenceClassificationNrm   r1   r1   r1   r2   r      rn   r   c                       s   e Zd ZdZ fddZdd Zdd Z									dd	eej	 d
eej
 deej	 deeeeej f  deej deej	 deej	 dee dee defddZ  ZS )MistralForQuestionAnsweringmodelc                    s   t  | t|| _| `d S N)r$   r%   ro   r   transformerr,   r/   r1   r2   r%      s   
z$MistralForQuestionAnswering.__init__c                 C   s   | j jS r   r   r   )r-   r1   r1   r2   get_input_embeddings   s   z0MistralForQuestionAnswering.get_input_embeddingsc                 C   s   || j _d S r   r   )r-   valuer1   r1   r2   set_input_embeddings   s   z0MistralForQuestionAnswering.set_input_embeddingsNrp   rD   rq   rr   rs   start_positionsend_positionsru   rv   rH   c
              	   K   s   | j |||||||	d}|j}| |}|jddd\}}|d }|d }d }|d urA|d urA| j||||fi |
}t||||j|j	dS )N)rD   rq   rr   rs   ru   rv   r   rI   )dim)lossstart_logits
end_logitsrB   r{   )
r   rz   
qa_outputssplitsqueezerX   loss_functionr   rB   r{   )r-   rp   rD   rq   rr   rs   r   r   ru   rv   rG   outputssequence_outputlogitsr   r   r   r1   r1   r2   rb      s0   

z#MistralForQuestionAnswering.forwardr   )r3   r4   r5   base_model_prefixr%   r   r   r   rd   rg   re   r   r   listr   r   r   rb   r6   r1   r1   r/   r2   r      sF    	
r   )r   r   ro   rl   r   r   )5typingr   r   r   rd   torch.utils.checkpointr   cache_utilsr   r   masking_utilsr	   r
   modeling_flash_attention_utilsr   modeling_outputsr   r   modeling_utilsr   processing_utilsr   utilsr   r   r   llama.modeling_llamar   r   r   r   r   r   r   r   r   r   r   configuration_mistralr    
get_loggerr3   r~   r!   r7   rh   rl   ro   r   r   r   r   __all__r1   r1   r1   r2   <module>   s0    4
6h9