o
    ei                     @   sR  d dl mZ d dlZddlmZ ddlmZ ddlmZm	Z	m
Z
 ddlmZ ddlmZ dd	lmZmZ d
dlmZmZmZmZmZmZmZ eeZdejdededejfddZ G dd deZ!G dd deZ"eG dd deZ#eG dd deZ$eG dd deZ%G dd de
e#Z&G dd de	e#Z'G d d! d!ee#Z(g d"Z)dS )#    )CallableN   )Cache)FlashAttentionKwargs)GenericForQuestionAnswering GenericForSequenceClassificationGenericForTokenClassification)ALL_ATTENTION_FUNCTIONS)Unpack)auto_docstringlogging   )MistralAttentionMistralDecoderLayerMistralForCausalLMMistralModelMistralPreTrainedModelapply_rotary_pos_embeager_attention_forwardpositions_idsbetamax_position_embeddingsreturnc              	   C   s*   d|t dt | |    }|dS )N   )torchlogfloor	unsqueeze)r   r   r   scaling r    o/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/ministral3/modular_ministral3.py_get_llama_4_attn_scale   s    
r"   c                   @   sj   e Zd Z		ddejdeejejf dejdB dedB dejdB dee	 deejejdB f fd	d
Z
dS )Ministral3AttentionNhidden_statesposition_embeddingsattention_maskpast_key_valuescache_positionkwargsr   c                 K   sP  |j d d }g |d| jR }| ||dd}	| ||dd}
| ||dd}|\}}t|	|
||\}	}
|	t|| j	j
d| j	j
d|	j }	|d urm|||d}||
|| j|\}
}t| j	jt}|| |	|
||f| jsdn| j| jt| j	dd d	|\}}|jg |dR   }| |}||fS )
Nr   r   r   llama_4_scaling_beta original_max_position_embeddings)sincosr(   g        sliding_window)dropoutr   r.   )shapehead_dimq_projview	transposek_projv_projr   r"   configrope_parametersgettodtypeupdate	layer_idxr	   get_interface_attn_implementationr   trainingattention_dropoutr   getattrreshape
contiguouso_proj)selfr$   r%   r&   r'   r(   r)   input_shapehidden_shapequery_states
key_statesvalue_statesr-   r,   cache_kwargsattention_interfaceattn_outputattn_weightsr    r    r!   forward#   sH   		

zMinistral3Attention.forward)NN)__name__
__module____qualname__r   Tensortupler   
LongTensorr
   r   rP   r    r    r    r!   r#   "   s$    r#   c                   @      e Zd ZdS )Ministral3DecoderLayerNrQ   rR   rS   r    r    r    r!   rX   U       rX   c                   @   rW   )Ministral3PreTrainedModelNrY   r    r    r    r!   r[   Y       r[   c                   @   rW   )Ministral3ModelNrY   r    r    r    r!   r]   ^   r\   r]   c                   @   rW   )Ministral3ForCausalLMNrY   r    r    r    r!   r^   c   r\   r^   c                   @   rW   ) Ministral3ForTokenClassificationNrY   r    r    r    r!   r_   h   rZ   r_   c                   @   rW   )#Ministral3ForSequenceClassificationNrY   r    r    r    r!   r`   l   rZ   r`   c                   @   rW   )Ministral3ForQuestionAnsweringNrY   r    r    r    r!   ra   p   rZ   ra   )r^   ra   r]   r[   r`   r_   )*collections.abcr   r   cache_utilsr   modeling_flash_attention_utilsr   modeling_layersr   r   r   modeling_utilsr	   processing_utilsr
   utilsr   r   mistral.modeling_mistralr   r   r   r   r   r   r   
get_loggerrQ   loggerrT   floatintr"   r#   rX   r[   r]   r^   r_   r`   ra   __all__r    r    r    r!   <module>   s.    $
3