o
    	۷iXC                     @   s  d dl Z d dlmZmZ d dlZd dlmZ ddlmZ ddlm	Z	 ddl
mZ ddlmZ dd	lmZ dd
lmZmZmZ ddlmZmZ ddlmZ ddlmZmZ ddlmZmZ ddlm Z m!Z! ddl"m#Z#m$Z$ ddl%m&Z&m'Z' dej(de)fddZ*G dd de!Z+G dd de Z,G dd deZ-G dd deZ.G d d! d!e$Z/G d"d# d#eZ0G d$d% d%eZ1G d&d' d'e#Z2G d(d) d)ej3Z4G d*d+ d+ej5Z6G d,d- d-eZ7G d.d/ d/e7Z8G d0d1 d1eZ9eG d2d3 d3ee	Z:g d4Z;dS )5    N)OptionalUnion)nn   )Cache)GenerationMixin)BaseModelOutput)PreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuple   )Aimv2AttentionAimv2EncoderLayer)	AutoModel)LlamaMLPLlamaRMSNorm)LlavaForConditionalGeneration
LlavaModel)LlavaNextCausalLMOutputWithPastLlavaNextModelOutputWithPast)SiglipEncoderSiglipVisionEmbeddings   )Ovis2ConfigOvis2VisionConfiglogitsdimc                 C   sJ   |  |}|j|ddd }tj| tjd||d}||  | }|S )NT)keepdimr   )memory_formatg      ?)softmaxmaxtorch
zeros_likelegacy_contiguous_formatscatter_detach)r   r   y_softindexy_hardret r,   ]/home/ubuntu/vllm_env/lib/python3.10/site-packages/transformers/models/ovis2/modular_ovis2.pyhard_softmax%   s
   
r.   c                   @      e Zd ZdS )Ovis2ModelOutputWithPastN__name__
__module____qualname__r,   r,   r,   r-   r0   /       r0   c                   @   r/   )Ovis2CausalLMOutputWithPastNr1   r,   r,   r,   r-   r6   3   r5   r6   c                   @   r/   )Ovis2RMSNormNr1   r,   r,   r,   r-   r7   7   r5   r7   c                   @   r/   )Ovis2VisionMLPNr1   r,   r,   r,   r-   r8   ;   r5   r8   c                       s@   e Zd Zdef fddZdd Zdejdejfdd	Z	  Z
S )
Ovis2VisionEmbeddingsconfigc                    s    t  | t|j|j| _d S N)super__init__r7   hidden_sizerms_norm_epsrms_normselfr:   	__class__r,   r-   r=   @   s   zOvis2VisionEmbeddings.__init__c                 C      t dNzNot needed for Ovis2)NotImplementedErrorrB   r,   r,   r-   interpolate_pos_encodingD   s   z.Ovis2VisionEmbeddings.interpolate_pos_encodingpixel_valuesreturnc                 C   sL   | j jj}|  |j|d}|ddd}| |}|| | j }|S )Ndtyper   r   )	patch_embeddingweightrM   toflatten	transposer@   position_embeddingposition_ids)rB   rJ   target_dtypepatch_embeds
embeddingsr,   r,   r-   forwardG   s   

zOvis2VisionEmbeddings.forward)r2   r3   r4   r   r=   rI   r#   FloatTensorTensorrX   __classcell__r,   r,   rC   r-   r9   ?   s    r9   c                   @   r/   )Ovis2VisionAttentionNr1   r,   r,   r,   r-   r\   R   r5   r\   c                   @   r/   )Ovis2VisionEncoderLayerNr1   r,   r,   r,   r-   r]   V   r5   r]   c                	       sN   e Zd Zdef fddZee	d
deej	 de
e defdd	Z  ZS )Ovis2VisionEncoderr:   c                    s0   t    t fddt jD | _d S )Nc                    s   g | ]}t  qS r,   )r]   ).0_r:   r,   r-   
<listcomp>]   s    z/Ovis2VisionEncoder.__init__.<locals>.<listcomp>)r<   r=   r   
ModuleListrangenum_hidden_layerslayersrA   rC   ra   r-   r=   [   s   $zOvis2VisionEncoder.__init__Nattention_maskkwargsrK   c                 K   s,   |}| j D ]}|||fi |}qt|dS )Nlast_hidden_state)rf   r   )rB   inputs_embedsrg   rh   hidden_statesencoder_layerr,   r,   r-   rX   _   s   

zOvis2VisionEncoder.forwardr;   )r2   r3   r4   r   r=   r   r   r   r#   rZ   r
   r   r   rX   r[   r,   r,   rC   r-   r^   Z   s    r^   c                       s>   e Zd Zdef fddZe	ddeej fddZ	  Z
S )	Ovis2VisionTransformerr:   c                    s>   t    || _t|| _t|| _t|j|j	| _
d| _d S )NF)r<   r=   r:   r9   rW   r^   encoderr7   r>   r?   r@   gradient_checkpointingrA   rC   r,   r-   r=   o   s   



zOvis2VisionTransformer.__init__Nrg   c                 K   s:   |  |}| jd||d|}|j}| |}t|dS )N)rk   rg   ri   r,   )rW   ro   rj   r@   r   )rB   rJ   rg   rh   rl   encoder_outputsrj   r,   r,   r-   rX   w   s   


zOvis2VisionTransformer.forwardr;   )r2   r3   r4   r   r=   r   r   r#   rZ   rX   r[   r,   r,   rC   r-   rn   n   s    rn   c                       s*   e Zd Zdejdejf fddZ  ZS )Ovis2VisualEmbeddingTablevisual_tokensrK   c                    s8   |j tjtjtjtjtjfv rt |S t	|| j
S r;   )rM   r#   int8int16int32int64longr<   rX   matmulrO   )rB   rs   rC   r,   r-   rX      s   z!Ovis2VisualEmbeddingTable.forward)r2   r3   r4   r#   rZ   rX   r[   r,   r,   rC   r-   rr      s    "rr   c                   @   s@   e Zd ZU eed< dZdZdgZdZdZ	dZ
dZdZdZdZdS )Ovis2PreTrainedModelr:   modelTr\   past_key_valuesN)r2   r3   r4   r   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_cache_class_supports_flash_attn_supports_flex_attn_supports_sdpa_can_compile_fullgraph_supports_attention_backendr,   r,   r,   r-   rz      s   
 rz   c                       sL   e Zd ZU eed< def fddZdejdeej	ej	f fddZ
  ZS )Ovis2VisionModelr:   c                    sl   t  | || _t|| _|j| _|j| _tj|j	|j
 |j
 | j| j dd| _t| j| j | _d S NF)bias)r<   r=   r:   rn   transformernum_visual_indicator_tokens
vocab_sizer   Linearr>   hidden_stridehead_linear	LayerNorm	head_normrA   rC   r,   r-   r=      s   

zOvis2VisionModel.__init__rJ   rK   c              	   K   sJ  | j |fi |}|d }| jjdkrl|j\}}}| jj}tt|}	|	|	 |kr.td||	|  | }
tj	
|ddd|
d|
fdd}|	|
7 }	|||	| ||	| ||}|dddddd}||d	|| | }| |}| |}| jjd
krtj	j|d	dd}|S | jjdkrt|d	d}|S | jjdkrtj	j|d	d}|S )Nr   r   z.Token sequence length must be a perfect squareconstantr   r         gumbel_argmaxT)r   hard	st_argmaxr   r!   )r   r:   r   shapeintmathsqrt
ValueErrorr   
functionalpadreshapepermuter   r   tokenize_functiongumbel_softmaxr.   r!   )rB   rJ   rh   outputsrj   
num_imagesseq_len
hidden_dimr   sqrt_lpad_sizer   
prob_tokenr,   r,   r-   rX      s:   

zOvis2VisionModel.forward)r2   r3   r4   r   r}   r=   r#   rY   tuplerZ   rX   r[   r,   r,   rC   r-   r      s   
 (r   c                !       s   e Zd Zi Zdef fddZdejdejfddZe	e
														dd
eej deej deej deej dee deej deej dee dee dee dee deej deeejf deeef fddZ  ZS )
Ovis2Modelr:   c                    sZ   t  | t|j| _t|jj|j| _|jj| _	|j| _|j
| _
t|j| _| `d S r;   )r<   r=   r   vision_configvision_towerrr   r   r>   visual_embeddings_tablevisual_vocab_sizevisual_indicator_token_idsr   from_configtext_configlanguage_modelmulti_modal_projectorrA   rC   r,   r-   r=      s   
zOvis2Model.__init__rJ   rK   c           	      C   s   |  |}|j\}}}tj||| j jf|j|jd|jd}tj||gdd}| 	|}tj
| j| j j | jtjd|j}| 	|}||fS )NF)rM   devicerequires_gradlayoutr   r   rL   )r   r   r#   zerosr   rM   r   r   catr   aranger   rx   rP   )	rB   rJ   image_features
batch_sizeimg_seq_lenr`   padding_tensorvisual_indicatorvisual_indicator_featuresr,   r,   r-   get_image_features   s(   


zOvis2Model.get_image_featuresNr   	input_idsrg   rT   r|   rk   labels	use_cacheoutput_attentionsoutput_hidden_statesreturn_dictcache_positionlogits_to_keepc                 K   sZ  |	d ur|	n| j j}	|
d ur|
n| j j}
|d u |d uA r td|d u r*|  |}|d ur| j|d\}}| j|||d}|||}t| j	D ];\}}|d u rg||  t
j|t
j|jdk}|d}n||k|j}| r|| || |j|j||< qI| jd	||||||	|
d||d
|}t|j|j|j|j|d ur|dS d dS )
Nz:You must specify exactly one of input_ids or inputs_embedsrJ   )rk   r   )rM   r   r   T)
rg   rT   r|   rk   r   r   r   r   r   r   )rj   r|   rl   
attentionsimage_hidden_statesr,   )r:   r   r   r   get_input_embeddingsr   get_placeholder_maskmasked_scatter	enumerater   r#   tensorrx   r   allrP   any	expand_asrM   r   r0   rj   r|   rl   r   )rB   r   rJ   rg   rT   r|   rk   r   r   r   r   r   r   r   rh   r   r   special_image_maskivisual_indicator_idmaskr   r,   r,   r-   rX      sf   

zOvis2Model.forwardNNNNNNNNNNNNr   )r2   r3   r4   _checkpoint_conversion_mappingr   r=   r#   rY   r   r   r   r   
LongTensorrZ   r   boolr   r   r   r0   rX   r[   r,   r,   rC   r-   r      sh    
	

r   c                !       s   e Zd Zi Zdef fddZedd Zdej	fddZ
ee																									
ddeej deej	 deej deej dee deej	 deej dee dee dee dee deej deeejf deeef fddZ  ZS )Ovis2ForConditionalGenerationr:   c                    s&   t  | tj|j|jdd| _d S r   )r<   r=   r   r   r>   r   lm_headrA   rC   r,   r-   r=   P  s   z&Ovis2ForConditionalGeneration.__init__c                 C   rE   rF   )AttributeErrorrH   r,   r,   r-   r   T  s   z3Ovis2ForConditionalGeneration.multi_modal_projectorrJ   c                 C   s   | j j|dS )Nr   )r{   r   )rB   rJ   r,   r,   r-   r   X  s   z0Ovis2ForConditionalGeneration.get_image_featuresNr   r   rg   rT   r|   rk   r   r   r   r   r   r   r   rK   c                 K   s   |	dur|	n| j j}	|
dur|
n| j j}
| jd||||||||	|
d|d|}|d }t|tr7t| dn|}| |dd|ddf }d}|dur\| jd||| j j	j
d|}t|||j|j|j|jdS )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Ovis2ForConditionalGeneration

        >>> model = Ovis2ForConditionalGeneration.from_pretrained("thisisiron/Ovis2-2B-hf")
        >>> processor = AutoProcessor.from_pretrained("thisisiron/Ovis2-2B-hf")

        >>> prompt = "<|im_start|>user\n<image>\nDescribe the image.<|im_end|>\n<|im_start|>assistant\n"
        >>> url = "http://images.cocodataset.org/val2014/COCO_val2014_000000537955.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, text=prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(**inputs, max_new_tokens=15)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True)[0]
        "user\n\nDescribe the image.\nassistant\nThe image features a brown dog standing on a wooden floor, looking up with"
        ```NT)r   rJ   rg   rT   r|   rk   r   r   r   r   r   r   )r   r   r   )lossr   r|   rl   r   r   r,   )r:   r   r   r{   
isinstancer   slicer   loss_functionr   r   r6   r|   rl   r   r   )rB   r   rJ   rg   rT   r|   rk   r   r   r   r   r   r   r   rh   r   rl   slice_indicesr   r   r,   r,   r-   rX   [  sH   .z%Ovis2ForConditionalGeneration.forwardr   )r2   r3   r4   r   r   r=   propertyr   r#   rY   r   r   r   r   r   rZ   r   r   r   r   r   r6   rX   r[   r,   r,   rC   r-   r   L  sd    
	

r   )rz   r   r   )<r   typingr   r   r#   r   cache_utilsr   
generationr   modeling_outputsr   modeling_utilsr	   processing_utilsr
   utilsr   r   r   aimv2.modeling_aimv2r   r   autor   llama.modeling_llamar   r   llava.modeling_llavar   r   llava_next.modeling_llava_nextr   r   siglip.modeling_siglipr   r   configuration_ovis2r   r   rZ   r   r.   r0   r6   r7   r8   r9   r\   r]   r^   Modulern   	Embeddingrr   rz   r   r   r   __all__r,   r,   r,   r-   <module>   sD   
4ve