o
    eil                     @   s  d dl mZ d dlZd dlmZ ddlmZmZ ddlmZm	Z	 ddl
mZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZmZ ddlmZ ddlmZ ddlmZmZmZm Z m!Z!m"Z"m#Z#m$Z$m%Z%m&Z& ddl'm(Z( e)e*Z+G dd de"Z,G dd deZ-G dd deZ.G dd de$Z/G dd de#Z0G dd deZ1G dd  d e!Z2G d!d" d"e Z3G d#d$ d$ee/Z4g d%Z5dS )&    )CallableN)nn   )CacheDynamicCache)create_causal_mask!create_sliding_window_causal_mask)FlashAttentionKwargs)GenericForQuestionAnswering)BaseModelOutputWithPast)ALL_ATTENTION_FUNCTIONS)Unpack)TransformersKwargsauto_docstringlogging)merge_with_config_defaults)capture_outputs   )
LlamaAttentionLlamaDecoderLayerLlamaForCausalLMLlamaForSequenceClassificationLlamaForTokenClassificationLlamaMLP
LlamaModelLlamaPreTrainedModelapply_rotary_pos_embeager_attention_forward   )MistralConfigc                       s   e Zd Z fddZ  ZS )
MistralMLPc                    sR   t  | tj| j| jdd| _tj| j| jdd| _tj| j| jdd| _d S )NFbias)	super__init__r   Linearhidden_sizeintermediate_size	gate_projup_proj	down_proj)selfconfig	__class__ i/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/mistral/modular_mistral.pyr$   %   s   zMistralMLP.__init__)__name__
__module____qualname__r$   __classcell__r/   r/   r-   r0   r    $   s    r    c                       s   e Zd Zdedef fddZ		ddejdeejejf dejdB d	e	dB d
ej
dB dee deejejdB f fddZ  ZS )MistralAttentionr,   	layer_idxc                    s   t  || t|dd p|j|j | _tj|j|j| j dd| _tj|j|j	| j dd| _
tj|j|j	| j dd| _tj|j| j |jdd| _d S )Nhead_dimFr!   )r#   r$   getattrr&   num_attention_headsr7   r   r%   q_projnum_key_value_headsk_projv_projo_projr+   r,   r6   r-   r/   r0   r$   -   s    zMistralAttention.__init__Nhidden_statesposition_embeddingsattention_maskpast_key_valuescache_positionkwargsreturnc                 K   s$  |j d d }g |d| jR }| ||dd}	| ||dd}
| ||dd}|\}}t|	|
||\}	}
|d urW|||d}||
|| j	|\}
}t
| jjt}|| |	|
||f| jskdn| j| jt| jdd d|\}}|jg |dR   }| |}||fS )Nr   r   )sincosrD   g        sliding_window)dropoutscalingrJ   )shaper7   r:   view	transposer<   r=   r   updater6   r   get_interfacer,   _attn_implementationr   trainingattention_dropoutrL   r8   reshape
contiguousr>   )r+   r@   rA   rB   rC   rD   rE   input_shapehidden_shapequery_states
key_statesvalue_statesrI   rH   cache_kwargsattention_interfaceattn_outputattn_weightsr/   r/   r0   forward5   s:   		

zMistralAttention.forward)NN)r1   r2   r3   r   intr$   torchTensortupler   
LongTensorr   r	   r`   r4   r/   r/   r-   r0   r5   ,   s&    r5   c                       s&   e Zd Zdedef fddZ  ZS )MistralDecoderLayerr,   r6   c                    s*   t  || t||d| _t|| _d S )N)r,   r6   )r#   r$   r5   	self_attnr    mlpr?   r-   r/   r0   r$   c   s   zMistralDecoderLayer.__init__)r1   r2   r3   r   ra   r$   r4   r/   r/   r-   r0   rf   b   s    rf   c                   @   s   e Zd ZeedZdS )MistralPreTrainedModel)r@   
attentionsN)r1   r2   r3   rf   r5   _can_record_outputsr/   r/   r/   r0   ri   i   s    
ri   c                   @   s   e Zd Zeee							ddejdB dejdB dejdB de	dB dej
dB dedB dejdB d	ee d
efddZdS )MistralModelN	input_idsrB   position_idsrC   inputs_embeds	use_cacherD   rE   rF   c              
   K   s   |d u |d uA rt d|d u r| |}|r!|d u r!t| jd}|d u r=|d ur-| nd}	tj|	|	|jd  |jd}|d u rF|	d}| jj
d u rNtnt}
|
| j|||||d}|}| j||d}| jd | jj D ]}||f||||||d|}qm| |}t||r|d	S d d	S )
Nz:You must specify exactly one of input_ids or inputs_embeds)r,   r   r   )device)r,   ro   rB   rD   rC   rn   )rn   )rB   rn   rC   rp   rD   rA   )last_hidden_staterC   )
ValueErrorembed_tokensr   r,   get_seq_lengthrb   arangerM   rq   	unsqueezerJ   r   r   
rotary_emblayersnum_hidden_layersnormr   )r+   rm   rB   rn   rC   ro   rp   rD   rE   past_seen_tokensmask_functioncausal_maskr@   rA   decoder_layerr/   r/   r0   r`   q   sX   

	

zMistralModel.forward)NNNNNNN)r1   r2   r3   r   r   r   rb   re   rc   r   FloatTensorboolr   r   r   r`   r/   r/   r/   r0   rl   p   s<    	
rl   c                   @      e Zd ZdS )MistralForCausalLMNr1   r2   r3   r/   r/   r/   r0   r          r   c                   @   r   )MistralForTokenClassificationNr   r/   r/   r/   r0   r      r   r   c                   @   r   ) MistralForSequenceClassificationNr   r/   r/   r/   r0   r      r   r   c                   @   r   )MistralForQuestionAnsweringNr   r/   r/   r/   r0   r      s    r   )r   r   rl   ri   r   r   )6collections.abcr   rb   r   cache_utilsr   r   masking_utilsr   r   modeling_flash_attention_utilsr	   modeling_layersr
   modeling_outputsr   modeling_utilsr   processing_utilsr   utilsr   r   r   utils.genericr   utils.output_capturingr   llama.modeling_llamar   r   r   r   r   r   r   r   r   r   configuration_mistralr   
get_loggerr1   loggerr    r5   rf   ri   rl   r   r   r   r   __all__r/   r/   r/   r0   <module>   s4    0
6@