o
    ei6                     @   s6  d dl Z d dl mZ ddlmZ ddlmZ ddlmZ ddlm	Z	 ddl
mZmZ dd	lmZmZ d
dlmZmZmZmZmZmZ d
dlmZ ddlmZ eeZG dd deZG dd dejZ G dd dejZ!G dd deZ"G dd deZ#G dd deZ$G dd deZ%G dd deZ&g dZ'dS )     N)nn   )ACT2FN)Cache)BaseModelOutputWithPooling)Unpack)auto_docstringlogging)can_return_tuplemerge_with_config_defaults   )LlavaCausalLMOutputWithPastLlavaForConditionalGeneration
LlavaModelLlavaModelOutputWithPastLlavaPreTrainedModelTransformersKwargs)MistralRMSNorm   )Mistral3Configc                   @      e Zd ZdS )Mistral3RMSNormN__name__
__module____qualname__ r   r   k/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/mistral3/modular_mistral3.pyr   (       r   c                       sB   e Zd ZdZdef fddZdejdejdejfdd	Z  Z	S )
Mistral3PatchMergerz<
    Learned merging of spatial_merge_size ** 2 patches
    configc                    sL   t    || _|jj}|j| _| jjj| _tj|| jd  |dd| _	d S )Nr   Fbias)
super__init__r    vision_confighidden_sizespatial_merge_size
patch_sizer   Linearmerging_layer)selfr    r&   	__class__r   r   r$   1   s   
 zMistral3PatchMerger.__init__image_featuresimage_sizesreturnc                    s    fdd|D }dd |D }|j d }g }t||D ]7\}}|| \}}	|||	|dddd}
tjjj	|
 j
 j
d}|| j
d  d }|| qtj|dd	} |}|S )
Nc                    s(   g | ]}|d   j  |d  j  fqS )r   r   )r(   ).0
image_sizer+   r   r   
<listcomp>;   s    z/Mistral3PatchMerger.forward.<locals>.<listcomp>c                 S   s   g | ]\}}|| qS r   r   )r1   hwr   r   r   r4   ?   s    r   r   r   )kernel_sizestridedim)shape	enumeratesplitviewpermute	unsqueezetorchr   
functionalunfoldr'   tappendcatr*   )r+   r.   r/   tokens_per_imagedpermuted_tensorimage_indeximage_tokensr5   r6   
image_gridgridr   r3   r   forward:   s"   



zMistral3PatchMerger.forward)
r   r   r   __doc__r   r$   rB   TensorrO   __classcell__r   r   r,   r   r   ,   s    $	r   c                       s8   e Zd Zdef fddZdejdejfddZ  ZS )Mistral3MultiModalProjectorr    c                    s   t    t|jj|jjd| _t|| _	t
|jtrdnt|j| _tj|jj| j |jj|jd| _t|j | _tj|jj|jj|jd| _d S )N)epsr   r!   )r#   r$   r   r%   r&   text_configrms_norm_epsnormr   patch_merger
isinstancevision_feature_layerintlennum_feature_layersr   r)   multimodal_projector_biaslinear_1r   projector_hidden_actactlinear_2)r+   r    r,   r   r   r$   S   s   

z$Mistral3MultiModalProjector.__init__r.   r/   c                 C   s8   |  |}| ||}| |}| |}| |}|S N)rW   rX   r_   ra   rb   )r+   r.   r/   hidden_statesr   r   r   rO   e   s   



z#Mistral3MultiModalProjector.forward)	r   r   r   r   r$   rB   rQ   rO   rR   r   r   r,   r   rS   R   s    rS   c                   @   r   )Mistral3CausalLMOutputWithPastNr   r   r   r   r   re   n   r   re   c                   @   r   )Mistral3ModelOutputWithPastNr   r   r   r   r   rf   r   r   rf   c                   @   r   )Mistral3PreTrainedModelNr   r   r   r   r   rg   v   r   rg   c                "   @   s  e Zd Zeeedd		ddejdejde	e
e	 B dB dedB dee d	eeB fd
dZee													ddejdB dejdB dejdB dejdB dedB dejdB de	e
e	 B dB dedB dedB dedB dedB dejdB dejdB dee d	eeB fddZdS )Mistral3ModelzWObtains image last hidden states from the vision tower and apply multimodal projection.)custom_introNpixel_valuesr/   rZ   output_hidden_stateskwargsr0   c                    s   dd |  D }| j|f|ddd| t|tr! j| }n fdd|D }tj|dd}| |d	|}| jj	| j
j }	tj||jd
|	 jdd }
t|d	|
}| _ S )Nc                 S   s   i | ]\}}|d ur||qS rc   r   )r1   kvr   r   r   
<dictcomp>   s    z4Mistral3Model.get_image_features.<locals>.<dictcomp>T)r/   rk   return_dictc                    s   g | ]} j | qS r   )rd   )r1   	layer_idximage_outputsr   r   r4      s    z4Mistral3Model.get_image_features.<locals>.<listcomp>r7   r:   r   )device)itemsvision_towerrY   r[   rd   rB   rG   multi_modal_projectorsqueezer(   r    r'   	as_tensorrt   prodtolistr>   pooler_output)r+   rj   r/   rZ   rk   rl   selected_image_featurehs_poolr.   downsample_ratiosplit_sizesr   rr   r   get_image_features{   s*   
	z Mistral3Model.get_image_features	input_idsattention_maskposition_idspast_key_valuesinputs_embeds	use_cacheoutput_attentionsrp   cache_positionc                 K   s  |	d ur|	n| j j}	|
d ur|
n| j j}
|d ur|n| j j}|d u |d uA r*td|d u r4|  |}|d ur]| j|||ddj}tj	|dd
|j|j}| j|||d}|||}| jd	||||||	|
d|d	|}t|j|j|j|j|d ur|dS d dS )
Nz:You must specify exactly one of input_ids or inputs_embedsT)rj   rZ   r/   rp   r   r:   )r   r.   )	r   r   r   r   r   r   rk   rp   r   )last_hidden_stater   rd   
attentionsimage_hidden_statesr   )r    r   rk   use_return_dict
ValueErrorget_input_embeddingsr   r|   rB   rG   tort   dtypeget_placeholder_maskmasked_scatterlanguage_modelrf   r   r   rd   r   )r+   r   rj   r   r   r   r   rZ   r   r   rk   rp   r   r/   rl   r.   special_image_maskoutputsr   r   r   rO      sX   

zMistral3Model.forward)NN)NNNNNNNNNNNNN)r   r   r   r
   r   r   rB   FloatTensorrQ   r[   listboolr   r   tupler   r   
LongTensorr   rf   rO   r   r   r   r   rh   z   s    #	
rh   c                "   @   s  e Zd Ze	ddejdejdeee B dB de	e
 deeB f
ddZ															dd
ejdB dejdB dejdB dejdB dedB dejdB dejdB dedB dedB dedB dedB dejdB deejB dejdB de	e
 deeB f ddZdS ) Mistral3ForConditionalGenerationNrj   r/   rZ   rl   r0   c                 K   s   | j jd|||d|S )N)rj   r/   rZ   r   )modelr   )r+   rj   r/   rZ   rl   r   r   r   r      s   z3Mistral3ForConditionalGeneration.get_image_featuresr   r   r   r   r   r   labelsr   r   rk   rp   r   logits_to_keepc                 K   s   |	dur|	n| j j}	|
dur|
n| j j}
|dur|n| j j}| jd||||||||	|
d||d|}|d }t|trBt| dn|}| |dd|ddf }d}|durg| j	d||| j j
jd|}t|||j|j|j|jdS )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, Mistral3ForConditionalGeneration

        >>> model = Mistral3ForConditionalGeneration.from_pretrained("mistralai/Mistral-Small-3.1-24B-Instruct-2503")
        >>> processor = AutoProcessor.from_pretrained("mistralai/Mistral-Small-3.1-24B-Instruct-2503")

        >>> prompt = "<s>[INST][IMG]What is the image?[/INST]"
        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> inputs = processor(images=image, text=prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(**inputs, max_new_tokens=15)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "What is the image?The image depicts two cats lying on a pink blanket."
        ```NT)r   rj   r   r   r   r   r   r   rk   rp   r   r/   r   )logitsr   
vocab_size)lossr   r   rd   r   r   r   )r    r   rk   r   r   rY   r[   slicelm_headloss_functionrU   r   re   r   rd   r   r   )r+   r   rj   r   r   r   r   r   r   r   rk   rp   r   r   r/   rl   r   rd   slice_indicesr   r   r   r   r   rO      sL   /z(Mistral3ForConditionalGeneration.forwardrc   )NNNNNNNNNNNNr   N)r   r   r   r   rB   r   rQ   r[   r   r   r   r   r   r   r   r   r   re   rO   r   r   r   r   r      sz    	
r   )rh   rg   r   )(rB   r   activationsr   cache_utilsr   modeling_outputsr   processing_utilsr   utilsr   r	   utils.genericr
   r   llava.modeling_llavar   r   r   r   r   r   mistral.modeling_mistralr   configuration_mistral3r   
get_loggerr   loggerr   Moduler   rS   re   rf   rg   rh   r   __all__r   r   r   r   <module>   s*    
&kj