o
    eiX                     @   s  d dl mZ d dlZd dlmZ ddlmZ ddlmZ ddlm	Z	 ddl
mZ dd	lmZmZmZ dd
lmZ ddlmZ ddlmZmZmZ ddlmZmZ ddlmZ ddlmZ edG dd dejZ G dd dejZ!G dd dejZ"eeddG dd deZ#eeddG dd deZ$eG d d! d!eZ%ed"dG d#d$ d$e%Z&ed%dG d&d' d'e%e	Z'g d(Z(dS ))    )	dataclassN)nn   )ACT2FN)Cache)GenerationMixin)use_kernel_forward_from_hub)BaseModelOutputWithPastBaseModelOutputWithPoolingModelOutput)PreTrainedModel)Unpack)TransformersKwargsauto_docstringtorch_compilable_check)can_return_tuplemerge_with_config_defaults   )	AutoModel   )Mistral3ConfigRMSNormc                       sF   e Zd Zddeddf fddZdejdejfdd	Zd
d Z  Z	S )Mistral3RMSNormư>epsreturnNc                    s&   t    tt|| _|| _dS )z>
        Mistral3RMSNorm is equivalent to T5LayerNorm
        N)super__init__r   	Parametertorchonesweightvariance_epsilon)selfhidden_sizer   	__class__ l/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/mistral3/modeling_mistral3.pyr   *   s   

zMistral3RMSNorm.__init__hidden_statesc                 C   sJ   |j }|tj}|djddd}|t|| j  }| j|| S )Nr   T)keepdim)	dtypetor   float32powmeanrsqrtr"   r!   )r#   r)   input_dtypevariancer'   r'   r(   forward2   s
   zMistral3RMSNorm.forwardc                 C   s   t | jj d| j S )Nz, eps=)tupler!   shaper"   r#   r'   r'   r(   
extra_repr9   s   zMistral3RMSNorm.extra_repr)r   )
__name__
__module____qualname__floatr   r   Tensorr4   r8   __classcell__r'   r'   r%   r(   r   (   s    r   c                       sB   e Zd ZdZdef fddZdejdejdejfdd	Z  Z	S )
Mistral3PatchMergerz<
    Learned merging of spatial_merge_size ** 2 patches
    configc                    sL   t    || _|jj}|j| _| jjj| _tj|| jd  |dd| _	d S )Nr   Fbias)
r   r   r@   vision_configr$   spatial_merge_size
patch_sizer   Linearmerging_layer)r#   r@   r$   r%   r'   r(   r   B   s   
 zMistral3PatchMerger.__init__image_featuresimage_sizesr   c                    s    fdd|D }dd |D }|j d }g }t||D ]7\}}|| \}}	|||	|dddd}
tjjj	|
 j
 j
d}|| j
d  d }|| qtj|dd	} |}|S )
Nc                    s(   g | ]}|d   j  |d  j  fqS )r   r   )rE   ).0
image_sizer7   r'   r(   
<listcomp>L   s    z/Mistral3PatchMerger.forward.<locals>.<listcomp>c                 S   s   g | ]\}}|| qS r'   r'   )rJ   hwr'   r'   r(   rL   P   s    r*   r   r   r   )kernel_sizestridedim)r6   	enumeratesplitviewpermute	unsqueezer   r   
functionalunfoldrD   tappendcatrG   )r#   rH   rI   tokens_per_imagedpermuted_tensorimage_indeximage_tokensrM   rN   
image_gridgridr'   r7   r(   r4   K   s"   



zMistral3PatchMerger.forward)
r9   r:   r;   __doc__r   r   r   r=   r4   r>   r'   r'   r%   r(   r?   =   s    $	r?   c                       s8   e Zd Zdef fddZdejdejfddZ  ZS )Mistral3MultiModalProjectorr@   c                    s   t    t|jj|jjd| _t|| _	t
|jtrdnt|j| _tj|jj| j |jj|jd| _t|j | _tj|jj|jj|jd| _d S )N)r   r   rA   )r   r   r   rC   r$   text_configrms_norm_epsnormr?   patch_merger
isinstancevision_feature_layerintlennum_feature_layersr   rF   multimodal_projector_biaslinear_1r   projector_hidden_actactlinear_2r#   r@   r%   r'   r(   r   d   s   

z$Mistral3MultiModalProjector.__init__rH   rI   c                 C   s8   |  |}| ||}| |}| |}| |}|S N)rh   ri   rp   rr   rs   )r#   rH   rI   r)   r'   r'   r(   r4   v   s   



z#Mistral3MultiModalProjector.forward)	r9   r:   r;   r   r   r   r=   r4   r>   r'   r'   r%   r(   re   c   s    re   zT
    Base class for Mistral3 causal language model (or autoregressive) outputs.
    custom_introc                   @   s   e Zd ZU dZdZejdB ed< dZejdB ed< dZ	e
dB ed< dZeej dB ed< dZeej dB ed< dZejdB ed< dS )	Mistral3CausalLMOutputWithPasta4  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    Nlosslogitspast_key_valuesr)   
attentionsimage_hidden_states)r9   r:   r;   rd   ry   r   FloatTensor__annotations__rz   r{   r   r)   r5   r|   r}   r'   r'   r'   r(   rx      s   
 rx   zM
    Base class for Mistral3 outputs, with hidden states and attentions.
    c                   @   s$   e Zd ZU dZdZejdB ed< dS )Mistral3ModelOutputWithPasta  
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    Nr}   )r9   r:   r;   rd   r}   r   r~   r   r'   r'   r'   r(   r      s   
 r   c                   @   s:   e Zd ZU eed< dZdZdZdZdZ	dZ
dZdZdZdS )Mistral3PreTrainedModelr@   model)imagetextTr{   N)r9   r:   r;   r   r   base_model_prefixinput_modalitiessupports_gradient_checkpointing_skip_keys_device_placement_supports_flash_attn_supports_sdpa_can_compile_fullgraph_supports_flex_attn_supports_attention_backendr'   r'   r'   r(   r      s   
 r   zx
    The Mistral3 model which consists of a vision backbone and a language model, without a language modeling head.
    c                "       sh  e Zd ZddiZdef fddZdd Zdd	 Zee	e
d
d		d#dejdejdeee B dB dedB dee deeB fddZdejdejdejfddZee
													d$dejdB dejdB dejdB dejdB dedB dejdB deee B dB dedB dedB dedB dedB d ejdB dejdB dee deeB fd!d"Z  ZS )%Mistral3Model^language_model.modellanguage_modelr@   c                    s>   t  | t|j| _t|| _t|j| _	| 
  d S ru   )r   r   r   from_configrC   vision_towerre   multi_modal_projectorrf   r   	post_initrt   r%   r'   r(   r      s
   
zMistral3Model.__init__c                 C   
   | j  S ru   )r   get_input_embeddingsr7   r'   r'   r(   r         
z"Mistral3Model.get_input_embeddingsc                 C      | j | d S ru   )r   set_input_embeddingsr#   valuer'   r'   r(   r         z"Mistral3Model.set_input_embeddingszWObtains image last hidden states from the vision tower and apply multimodal projection.rv   Npixel_valuesrI   rk   output_hidden_stateskwargsr   c                    s   dd |  D }| j|f|ddd| t|tr! j| }n fdd|D }tj|dd}| |d	|}| jj	| j
j }	tj||jd
|	 jdd }
t|d	|
}| _ S )Nc                 S   s   i | ]\}}|d ur||qS ru   r'   )rJ   kvr'   r'   r(   
<dictcomp>   s    z4Mistral3Model.get_image_features.<locals>.<dictcomp>T)rI   r   return_dictc                    s   g | ]} j | qS r'   )r)   )rJ   	layer_idximage_outputsr'   r(   rL      s    z4Mistral3Model.get_image_features.<locals>.<listcomp>r*   rQ   r   )device)itemsr   rj   rl   r)   r   r\   r   squeezerE   r@   rD   	as_tensorr   prodtolistrT   pooler_output)r#   r   rI   rk   r   r   selected_image_featurehs_poolrH   downsample_ratiosplit_sizesr'   r   r(   get_image_features   s*   
	z Mistral3Model.get_image_features	input_idsinputs_embedsrH   c                 C   s   |du r||   tj| jjtj|jdk}|d}n|| jjk}| }|j	d |j	d  }|
d||j}t||  | kd| d|  |S )z
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
        equal to the length of multimodal features. If the lengths are different, an error is raised.
        N)r,   r   r*   r   r   z6Image features and image tokens do not match, tokens: z, features: )r   r   tensorr@   image_token_idlongr   allsumr6   rW   	expand_asr-   r   numel)r#   r   r   rH   special_image_maskn_image_tokensn_image_featuresr'   r'   r(   get_placeholder_mask  s   z"Mistral3Model.get_placeholder_maskattention_maskposition_idsr{   	use_cacheoutput_attentionsr   cache_positionc                 K   s  |	d ur|	n| j j}	|
d ur|
n| j j}
|d ur|n| j j}|d u |d uA r*td|d u r4|  |}|d ur]| j|||ddj}tj	|dd
|j|j}| j|||d}|||}| jd	||||||	|
d|d	|}t|j|j|j|j|d ur|dS d dS )
Nz:You must specify exactly one of input_ids or inputs_embedsT)r   rk   rI   r   r   rQ   )r   rH   )	r   r   r{   r   r   r   r   r   r   )last_hidden_stater{   r)   r|   r}   r'   )r@   r   r   use_return_dict
ValueErrorr   r   r   r   r\   r-   r   r,   r   masked_scatterr   r   r   r{   r)   r|   )r#   r   r   r   r   r{   r   rk   r   r   r   r   r   rI   r   rH   r   outputsr'   r'   r(   r4     sX   

zMistral3Model.forward)NN)NNNNNNNNNNNNN)r9   r:   r;   _checkpoint_conversion_mappingr   r   r   r   r   r   r   r   r~   r=   rl   listboolr   r   r5   r
   r   
LongTensorr   r   r   r4   r>   r'   r'   r%   r(   r      s    #
	
r   zV
    The MISTRAL3 model which consists of a vision backbone and a language model.
    c                $       st  e Zd ZdddddZddiZdef fd	d
Zdd Zdd Zde	j
fddZe	d+dejdejdeee B dB dee deeB f
ddZee														d,dejdB dejdB dejdB dejdB dedB dejdB dejdB d edB d!edB d"edB d#edB d$ejdB d%eejB dejdB dee deeB f d&d'Z							(d- fd)d*	Z  ZS ). Mistral3ForConditionalGenerationzmodel.language_modelzmodel.vision_towerzmodel.multi_modal_projectorlm_head)r   z^vision_towerz^multi_modal_projectorz^language_model.lm_headzlm_head.weightz(model.language_model.embed_tokens.weightr@   c                    s<   t  | t|| _tj|jj|jjdd| _	| 
  d S )NFrA   )r   r   r   r   r   rF   rf   r$   
vocab_sizer   r   rt   r%   r'   r(   r   j  s   
z)Mistral3ForConditionalGeneration.__init__c                 C   r   ru   )r   r   r7   r'   r'   r(   r   p  r   z5Mistral3ForConditionalGeneration.get_input_embeddingsc                 C   r   ru   )r   r   r   r'   r'   r(   r   s  r   z5Mistral3ForConditionalGeneration.set_input_embeddingsr   c                 C   s   | j S ru   )r   r7   r'   r'   r(   get_output_embeddingsv  s   z6Mistral3ForConditionalGeneration.get_output_embeddingsNr   rI   rk   r   c                 K   s   | j jd|||d|S )N)r   rI   rk   r'   )r   r   )r#   r   rI   rk   r   r'   r'   r(   r   y  s   z3Mistral3ForConditionalGeneration.get_image_featuresr   r   r   r   r{   r   labelsr   r   r   r   r   logits_to_keepc                 K   s   |	dur|	n| j j}	|
dur|
n| j j}
|dur|n| j j}| jd||||||||	|
d||d|}|d }t|trBt| dn|}| |dd|ddf }d}|durg| j	d||| j j
jd|}t|||j|j|j|jdS )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, Mistral3ForConditionalGeneration

        >>> model = Mistral3ForConditionalGeneration.from_pretrained("mistralai/Mistral-Small-3.1-24B-Instruct-2503")
        >>> processor = AutoProcessor.from_pretrained("mistralai/Mistral-Small-3.1-24B-Instruct-2503")

        >>> prompt = "<s>[INST][IMG]What is the image?[/INST]"
        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> inputs = processor(images=image, text=prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(**inputs, max_new_tokens=15)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "What is the image?The image depicts two cats lying on a pink blanket."
        ```NT)r   r   r   r   r{   r   r   r   r   r   r   rI   r   )rz   r   r   )ry   rz   r{   r)   r|   r}   r'   )r@   r   r   r   r   rj   rl   slicer   loss_functionrf   r   rx   r{   r)   r|   r}   )r#   r   r   r   r   r{   r   r   r   r   r   r   r   r   rI   r   r   r)   slice_indicesrz   ry   r'   r'   r(   r4     sL   1z(Mistral3ForConditionalGeneration.forwardFc	              	      s>   t  j|f||||||d|	}
|s|	dds||
d< |
S )N)r{   r   r   r   r   is_first_iterationr   Tr   )r   prepare_inputs_for_generationget)r#   r   r{   r   r   r   r   r   r   r   model_inputsr%   r'   r(   r     s   z>Mistral3ForConditionalGeneration.prepare_inputs_for_generationru   )NNNNNNNNNNNNr   N)NNNNNNF)r9   r:   r;   r   _tied_weights_keysr   r   r   r   r   Moduler   r   r   r~   r=   rl   r   r   r   r5   r
   r   r   r   r   r   rx   r4   r   r>   r'   r'   r%   r(   r   \  s    	
\r   )r   r   r   ))dataclassesr   r   r   activationsr   cache_utilsr   
generationr   integrationsr   modeling_outputsr	   r
   r   modeling_utilsr   processing_utilsr   utilsr   r   r   utils.genericr   r   autor   configuration_mistral3r   r   r   r?   re   rx   r   r   r   r   __all__r'   r'   r'   r(   <module>   sR   &  &