o
    eiJ                     @   s.  d dl Z d dlmZ d dlZd dlmZ ddlmZ ddlm	Z	 ddl
mZ ddlmZmZ dd	lmZ dd
lmZ ddlmZmZmZ ddlmZ ddlmZ ddlmZmZ ddlmZ ddl m!Z!m"Z" ddl#m$Z$m%Z% ddl&m'Z'm(Z( ddl)m*Z*m+Z+ ddl,m-Z-m.Z. dej/de0fddZ1eeG dd deZ2G dd de(Z3G dd  d e'Z4G d!d" d"e"Z5G d#d$ d$e!Z6G d%d& d&e+Z7G d'd( d(eZ8G d)d* d*eZ9G d+d, d,e*Z:G d-d. d.ej;Z<G d/d0 d0ej=Z>G d1d2 d2eZ?G d3d4 d4e?Z@G d5d6 d6e%ZAeG d7d8 d8e$eZBg d9ZCdS ):    N)	dataclass)nn   )initialization)Cache)GenerationMixin)BaseModelOutputBaseModelOutputWithPooling)PreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuple)merge_with_config_defaults)capture_outputs   )Aimv2AttentionAimv2EncoderLayer)	AutoModel)LlamaMLPLlamaRMSNorm)LlavaForConditionalGeneration
LlavaModel)LlavaNextCausalLMOutputWithPastLlavaNextModelOutputWithPast)SiglipEncoderSiglipVisionEmbeddings   )Ovis2ConfigOvis2VisionConfiglogitsdimc                 C   sJ   |  |}|j|ddd }tj| tjd||d}||  | }|S )NT)keepdimr   )memory_formatg      ?)softmaxmaxtorch
zeros_likelegacy_contiguous_formatscatter_detach)r    r!   y_softindexy_hardret r/   e/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/ovis2/modular_ovis2.pyhard_softmax'   s
   
r1   c                   @   s$   e Zd ZU dZdZejdB ed< dS )*BaseModelOutputWithVisualIndicatorFeaturesz
    visual_indicator_features (`torch.FloatTensor` of shape `(batch_size, visual_indicator_size)`):
        Visual indicator features extracted from the model, which can be used for auxiliary tasks or further processing.
    Nvisual_indicator_features)__name__
__module____qualname____doc__r3   r&   FloatTensor__annotations__r/   r/   r/   r0   r2   1   s   
 r2   c                   @      e Zd ZdS )Ovis2ModelOutputWithPastNr4   r5   r6   r/   r/   r/   r0   r;   <       r;   c                   @   r:   )Ovis2CausalLMOutputWithPastNr<   r/   r/   r/   r0   r>   @   r=   r>   c                   @   r:   )Ovis2RMSNormNr<   r/   r/   r/   r0   r?   D   r=   r?   c                   @   r:   )Ovis2VisionMLPNr<   r/   r/   r/   r0   r@   H   r=   r@   c                       s@   e Zd Zdef fddZdd Zdejdejfdd	Z	  Z
S )
Ovis2VisionEmbeddingsconfigc                    s    t  | t|j|j| _d S N)super__init__r?   hidden_sizerms_norm_epsrms_normselfrB   	__class__r/   r0   rE   M   s   zOvis2VisionEmbeddings.__init__c                 C   s   t d)NzNot needed for Ovis2)NotImplementedError)rJ   r/   r/   r0   interpolate_pos_encodingQ   s   z.Ovis2VisionEmbeddings.interpolate_pos_encodingpixel_valuesreturnc                 C   sL   | j jj}|  |j|d}|ddd}| |}|| | j }|S )Ndtyper   r   )	patch_embeddingweightrR   toflatten	transposerH   position_embeddingposition_ids)rJ   rO   target_dtypepatch_embeds
embeddingsr/   r/   r0   forwardT   s   

zOvis2VisionEmbeddings.forward)r4   r5   r6   r   rE   rN   r&   r8   Tensorr]   __classcell__r/   r/   rK   r0   rA   L   s    rA   c                   @   r:   )Ovis2VisionAttentionNr<   r/   r/   r/   r0   r`   _   r=   r`   c                       s"   e Zd Zdef fddZ  ZS )Ovis2VisionEncoderLayerrB   c                    s   t    t|| _d S rC   )rD   rE   r`   	attentionrI   rK   r/   r0   rE   d   s   
z Ovis2VisionEncoderLayer.__init__)r4   r5   r6   r   rE   r_   r/   r/   rK   r0   ra   c   s    ra   c                	       sN   e Zd Zdef fddZee	d
dejdB de	e
 defdd	Z  ZS )Ovis2VisionEncoderrB   c                    s0   t    t fddt jD | _d S )Nc                    s   g | ]}t  qS r/   )ra   ).0_rB   r/   r0   
<listcomp>l   s    z/Ovis2VisionEncoder.__init__.<locals>.<listcomp>)rD   rE   r   
ModuleListrangenum_hidden_layerslayersrI   rK   rf   r0   rE   j   s   $zOvis2VisionEncoder.__init__Nattention_maskkwargsrP   c                 K   s,   |}| j D ]}|||fi |}qt|dS )Nlast_hidden_state)rk   r   )rJ   inputs_embedsrl   rm   hidden_statesencoder_layerr/   r/   r0   r]   n   s   

zOvis2VisionEncoder.forwardrC   )r4   r5   r6   r   rE   r   r   r&   r^   r   r   r   r]   r_   r/   r/   rK   r0   rc   i   s    rc   c                       s>   e Zd Zdef fddZe	ddejdB fddZ  Z	S )	Ovis2VisionTransformerrB   c                    s>   t    || _t|| _t|| _t|j|j	| _
d| _d S )NF)rD   rE   rB   rA   r\   rc   encoderr?   rF   rG   rH   gradient_checkpointingrI   rK   r/   r0   rE   ~   s   



zOvis2VisionTransformer.__init__Nrl   c                 K   s:   |  |}| jd||d|}|j}| |}t|dS )N)rp   rl   rn   r/   )r\   rt   ro   rH   r   )rJ   rO   rl   rm   rq   encoder_outputsro   r/   r/   r0   r]      s   


zOvis2VisionTransformer.forwardrC   )
r4   r5   r6   r   rE   r   r&   r^   r]   r_   r/   r/   rK   r0   rs   }   s    rs   c                       s*   e Zd Zdejdejf fddZ  ZS )Ovis2VisualEmbeddingTablevisual_tokensrP   c                    s8   |j tjtjtjtjtjfv rt |S t	|| j
S rC   )rR   r&   int8int16int32int64longrD   r]   matmulrT   )rJ   rx   rK   r/   r0   r]      s   z!Ovis2VisualEmbeddingTable.forward)r4   r5   r6   r&   r^   r]   r_   r/   r/   rK   r0   rw      s    "rw   c                       sT   e Zd ZU eed< dZdZdZdgZdZ	dZ
dZdZdZdZdZ fddZ  ZS )	Ovis2PreTrainedModelrB   model)imagetextTr`   past_key_valuesc                    s@   t  | t|trt|jt|jj	d 
d d S d S )N)r   r   )rD   _init_weights
isinstancerA   initcopy_rY   r&   arangeshapeexpand)rJ   modulerK   r/   r0   r      s   
&z"Ovis2PreTrainedModel._init_weights)r4   r5   r6   r   r9   base_model_prefixinput_modalitiessupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_cache_class_supports_flash_attn_supports_flex_attn_supports_sdpa_can_compile_fullgraph_supports_attention_backendr   r_   r/   r/   rK   r0   r      s   
 r   c                	       s^   e Zd ZU eed< eedZdef fddZe	e
dejdee deeB fdd	Z  ZS )
Ovis2VisionModelrB   )rq   
attentionsc                    st   t  | || _t|| _|j| _|j| _tj|j	|j
 |j
 | j| j dd| _t| j| j | _|   d S NF)bias)rD   rE   rB   rs   transformernum_visual_indicator_tokens
vocab_sizer   LinearrF   hidden_stridehead_linear	LayerNorm	head_norm	post_initrI   rK   r/   r0   rE      s   

zOvis2VisionModel.__init__rO   rm   rP   c              	   K   sN  | j |fi |}|d }| jjdkrl|j\}}}| jj}tt|}	|	|	 |kr.td||	|  | }
tj	
|ddd|
d|
fdd}|	|
7 }	|||	| ||	| ||}|dddddd}||d	|| | }| |}| |}| jjd
krtj	j|d	dd}n| jjdkrt|d	d}n| jjdkrtj	j|d	d}t||dS )Nr   r   z.Token sequence length must be a perfect squareconstantr   r         r   gumbel_argmaxT)r!   hard	st_argmaxr!   r$   )ro   pooler_output)r   rB   r   r   intmathsqrt
ValueErrorr   
functionalpadreshapepermuter   r   tokenize_functiongumbel_softmaxr1   r$   r2   )rJ   rO   rm   outputsro   
num_imagesseq_len
hidden_dimr   sqrt_lpad_sizer    
prob_tokenr/   r/   r0   r]      s<   

zOvis2VisionModel.forward)r4   r5   r6   r   r9   ra   r`   _can_record_outputsrE   r   r   r&   r8   r   r   tupler2   r]   r_   r/   r/   rK   r0   r      s   
 r   c                        s  e Zd Zi Zdef fddZeedddej	de
e deeB fd	d
Zee													ddejdB dej	dB dejdB dejdB dedB dej	dB dejdB dedB dedB dedB dedB dejdB deejB deeB fddZ  ZS )
Ovis2ModelrB   c                    sZ   t  | t|j| _t|jj|j| _|jj| _	|j| _|j
| _
t|j| _| `d S rC   )rD   rE   r   vision_configvision_towerrw   r   rF   visual_embeddings_tablevisual_vocab_sizevisual_indicator_token_idsr   from_configtext_configlanguage_modelmulti_modal_projectorrI   rK   r/   r0   rE      s   
zOvis2Model.__init__zWObtains image last hidden states from the vision tower and apply multimodal projection.)custom_introrO   rm   rP   c           
      K   s   | j |fddi|}|j}|j\}}}tj||| j jf|j|jd|jd}tj	||gdd}| 
|}tj| j| j j | jtjd|j}	||_| 
|	|_|S )Nreturn_dictTF)rR   devicerequires_gradlayoutr   r   rQ   )r   r   r   r&   zerosr   rR   r   r   catr   r   r   r}   rU   r3   )
rJ   rO   rm   image_outputsimage_features
batch_sizeimg_seq_lenre   padding_tensorvisual_indicatorr/   r/   r0   get_image_features  s,   	
zOvis2Model.get_image_featuresNr   	input_idsrl   rY   r   rp   labels	use_cacheoutput_attentionsoutput_hidden_statesr   cache_positionlogits_to_keepc                 K   sd  |	d ur|	n| j j}	|
d ur|
n| j j}
|d u |d uA r td|d u r*|  |}|d ur| j|dd}|j}|j}| j|||d}|	||}t
| jD ];\}}|d u rl||  tj|tj|jdk}|d}n||k|j}| r|| || |j|j||< qN| jd	||||||	|
d||d
|}t|j|j|j|j|d ur|dS d dS )
Nz:You must specify exactly one of input_ids or inputs_embedsT)rO   r   )rp   r   )rR   r   r   )
rl   rY   r   rp   r   r   r   r   r   r   )ro   r   rq   r   image_hidden_statesr/   )rB   r   r   r   get_input_embeddingsr   r   r3   get_placeholder_maskmasked_scatter	enumerater   r&   tensorr}   r   allrU   any	expand_asrR   r   r;   ro   r   rq   r   )rJ   r   rO   rl   rY   r   rp   r   r   r   r   r   r   r   rm   r   r   r3   special_image_maskivisual_indicator_idmaskr   r/   r/   r0   r]   &  sj   

zOvis2Model.forwardNNNNNNNNNNNNr   )r4   r5   r6   _checkpoint_conversion_mappingr   rE   r   r   r&   r8   r   r   r   r2   r   
LongTensorr^   r   boolr   r;   r]   r_   r/   r/   rK   r0   r      st    	
r   c                        s   e Zd Zi Zdef fddZedejde	e
 deeB fddZee																									
ddejd	B dejd	B dejd	B dejd	B ded	B dejd	B dejd	B ded	B ded	B ded	B ded	B dejd	B deejB deeB fddZ  ZS )Ovis2ForConditionalGenerationrB   c                    s&   t  | tj|j|jdd| _d S r   )rD   rE   r   r   rF   r   lm_headrI   rK   r/   r0   rE   {  s   z&Ovis2ForConditionalGeneration.__init__rO   rm   rP   c                 K   s   | j jdd|i|S )NrO   r/   )r   r   )rJ   rO   rm   r/   r/   r0   r     s   z0Ovis2ForConditionalGeneration.get_image_featuresNr   r   rl   rY   r   rp   r   r   r   r   r   r   r   c                 K   s   |	dur|	n| j j}	|
dur|
n| j j}
| jd||||||||	|
d|d|}|d }t|tr7t| dn|}| |dd|ddf }d}|dur\| jd||| j j	j
d|}t|||j|j|j|jdS )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, Ovis2ForConditionalGeneration

        >>> model = Ovis2ForConditionalGeneration.from_pretrained("thisisiron/Ovis2-2B-hf")
        >>> processor = AutoProcessor.from_pretrained("thisisiron/Ovis2-2B-hf")

        >>> prompt = "<|im_start|>user\n<image>\nDescribe the image.<|im_end|>\n<|im_start|>assistant\n"
        >>> url = "http://images.cocodataset.org/val2014/COCO_val2014_000000537955.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> inputs = processor(images=image, text=prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(**inputs, max_new_tokens=15)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True)[0]
        "user\n\nDescribe the image.\nassistant\nThe image features a brown dog standing on a wooden floor, looking up with"
        ```NT)r   rO   rl   rY   r   rp   r   r   r   r   r   r   )r    r   r   )lossr    r   rq   r   r   r/   )rB   r   r   r   r   r   slicer   loss_functionr   r   r>   r   rq   r   r   )rJ   r   rO   rl   rY   r   rp   r   r   r   r   r   r   r   rm   r   rq   slice_indicesr    r   r/   r/   r0   r]     sH   0z%Ovis2ForConditionalGeneration.forwardr   )r4   r5   r6   r   r   rE   r   r&   r8   r   r   r   r2   r   r   r   r^   r   r   r   r>   r]   r_   r/   r/   rK   r0   r   w  sn    	
r   )r   r   r   )Dr   dataclassesr   r&   r    r   r   cache_utilsr   
generationr   modeling_outputsr   r	   modeling_utilsr
   processing_utilsr   utilsr   r   r   utils.genericr   utils.output_capturingr   aimv2.modeling_aimv2r   r   autor   llama.modeling_llamar   r   llava.modeling_llavar   r   llava_next.modeling_llava_nextr   r   siglip.modeling_siglipr   r   configuration_ovis2r   r   r^   r   r1   r2   r;   r>   r?   r@   rA   r`   ra   rc   Modulers   	Embeddingrw   r   r   r   r   __all__r/   r/   r/   r0   <module>   sP   
	Af