o
    ei.                     @   s  d Z ddlmZ ddlmZ ddlmZ ddlZddlm	  m
Z ddlm	Z	 ddlmZ dd	lmZ dd
lmZmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZmZmZ ddl m!Z! ddl"m#Z#m$Z$m%Z%m&Z& ddl'm(Z( ddl)m*Z*m+Z+ ddl,m-Z- ddl.m/Z/ ddl0m1Z1m2Z2 e&3e4Z5ee$ddG dd deZ6ee$ddG dd deZ7				dId d!Z8g fd"d#Z9G d$d% d%e	j:Z;G d&d' d'e	j<Z=G d(d) d)e	j>Z?G d*d+ d+ej	j>Z@d,d- ZAdJd.d/ZBG d0d1 d1e	j>ZC	2dKd3e	j>d4ejDd5ejDd6ejDd7ejDdB d8eEd9eEfd:d;ZFG d<d= d=e	j>ZGG d>d? d?eZHG d@dA dAeZIe$G dBdC dCeZJe$G dDdE dEeJZKG dFdG dGeJeZLg dHZMdS )LzPyTorch Idefics model.    )Callable)	dataclass)AnyN)nn   )initialization)ACT2FN)CacheDynamicCache)GenerationMixin)create_causal_mask)GradientCheckpointingLayer)ModelOutput)ALL_ATTENTION_FUNCTIONSPreTrainedConfigPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging)merge_with_config_defaults)OutputRecordercapture_outputs   )IdeficsConfig)IdeficsPerceiverResampler)IdeficsVisionEmbeddingsIdeficsVisionTransformerz{
    Base class for Idefics model's outputs that may also contain a past key/values (to speed up sequential decoding).
    )custom_introc                   @   sv   e Zd ZU dZdZejdB ed< dZe	dB ed< dZ
eej dB ed< dZeej dB ed< dZeej dB ed< dS )IdeficsBaseModelOutputWithPasta  
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Sequence of hidden-states at the output of the last layer of the model.

        If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
        hidden_size)` is output.
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
        `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
        input) to speed up sequential decoding.
    image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
        Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
        sequence_length, hidden_size)`.

        image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver
    Nlast_hidden_statepast_key_valueshidden_states
attentionsimage_hidden_states)__name__
__module____qualname____doc__r!   torchFloatTensor__annotations__r"   r	   r#   tupler$   r%    r.   r.   j/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/idefics/modeling_idefics.pyr    1   s   
 r    zS
    Base class for Idefics causal language model (or autoregressive) outputs.
    c                   @   s   e Zd ZU dZdZejdB ed< dZejdB ed< dZ	e
dB ed< dZeej dB ed< dZeej dB ed< dZeej dB ed< dS )	IdeficsCausalLMOutputWithPastae  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
        Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
        sequence_length, hidden_size)`.

        image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver
    Nlosslogitsr"   r#   r$   r%   )r&   r'   r(   r)   r1   r*   r+   r,   r2   r"   r	   r#   r-   r$   r%   r.   r.   r.   r/   r0   R   s   
 r0   Fc                 K   sB  t | jd ddd|d| j}| d|} |d|d< |d|d< |d|d< |d|d< d|v rI|d }|d||d< |d urU|d||d	< |d d ure|d d||d< |d d ury|d d||d< | |fS |d d ur|d d||d< | |fS |d d ur|d d||d< | |fS )
Nr   r   pixel_valuesimage_encoder_embeddingsperceiver_embeddingsimage_attention_masktoken_type_idsattention_mask)	r*   arangeshapeviewrepeattodeviceindex_selectget)	input_idsexpand_sizeis_encoder_decoderr9   encoder_outputsmodel_kwargsexpanded_return_idxr8   r.   r.   r/   expand_inputs_for_generationr   s:   ,		rH   c                    sf   t jt jt jd  fdd|D }|  D ]|r+tfdd|D r+d qd q| S )N)	LayerNormLinear	Embeddingc                    s   g | ]} | qS r.   r.   ).0m)mappingr.   r/   
<listcomp>   s    z freeze_model.<locals>.<listcomp>c                 3   s    | ]}t  |V  qd S N)
isinstance)rL   t)moduler.   r/   	<genexpr>   s    zfreeze_model.<locals>.<genexpr>TF)r   rI   rJ   rK   modulesanyrequires_grad_)modelmodule_exceptionsmodule_exceptions_mappedr.   )rN   rS   r/   freeze_model   s   r[   c                       sN   e Zd ZdZ				ddedB ddf fddZdd	 Zdefd
dZ  Z	S )IdeficsDecoupledEmbeddinga  
    Implements a decoupling of parameters to allow freezing (or not) a subset of the embeddings. In practise, the
    regular `weight` can be trained or frozen (i.e. `partially_freeze=True`), and if `num_additional_embeddings` > 0,
    then it will create `num_additional_embeddings` additional parameters that are always trained. If
    `num_additional_embeddings=0`, then the module defaults back to the regular behavior of `nn.Embedding`.
    FNpartially_freezereturnc           	         s   |dur||krt d| d| t jd|||||d| || _|| _|| _|| _|r5| jd | jdkrGt	j
| j|||d| _dS dS )	a)  
        Args:
            num_embeddings (`int`):
                Size of the dictionary of embeddings
            num_additional_embeddings (`int`):
                Number of additional embeddings. Only useful when you `partially_freeze=True`.
            embedding_dim (`int`):
                The size of each embedding vector
            partially_freeze: (`bool`, *optional*, defaults to `False`):
                If `True`, the regular `weight` will be frozen. `additional_weight` is never frozen.
            padding_idx (`int`, *optional*):
                The padding index (needs to be less than num_embeddings)

        Note: there are a lot of other parameters to initialize a standard `nn.Embedding` such as `padding_idx`,
        `max_norm` or `norm_type`. We are not supporting these.
        Nz/padding_idx must be within num_embeddings. Got z and )num_embeddingsembedding_dimr?   dtypepadding_idxFr   )r_   r`   r?   ra   r.   )
ValueErrorsuper__init__r_   rb   num_additional_embeddingsr]   weightrW   r   rK   additional_embedding)	selfr_   rf   r`   r]   r?   ra   rb   kwargs	__class__r.   r/   re      s2   
z"IdeficsDecoupledEmbedding.__init__c                 C   sj   | j dkrt|| jS | }t|| jk}|| }| || j }d||< t|| j}|||< |S )a  
        we have 2 embeddings, with different indices - one pretrained self.weight and another
        self.additional_embedding.weight that is being trained.

        in order to make a lookup of the input ids, we:
        1. find out the indices of the entries belonging to the 2nd embedding
        2. extract those values while subtracting the size of the first embedding (num_embeddings), since the 2nd
           embedding starts from 0 and not num_embeddings
        3. perform the 2nd embedding lookup
        4. now we handle the 1st embedding, we overwrite indices belonging to the 2nd embedding with a padding index
        5. perform the 1st embedding lookup
        6. now we overwrite the values in the 1st embedding lookup with the values of the 2nd embedding lookup

        note: for the 1st embedding lookup we could have looked up only the low indices and not do the padding, but
        then we have to create a new tensor and populate it with 2 tensors that are spread out across various indices -
        i.e. not a simple concat - I haven't benchmarked the complex case if it's any faster, given that seqlens are
        usually relatively short it's probably not faster or if faster not by much - but might be a good idea to
        measure.

        r   )	rf   F	embeddingrg   cloner*   wherer_   rh   )ri   rB   additional_vocab_indicesinput_ids_additional_vocabadditional_embeddingsfull_vectorr.   r.   r/   forward   s   
z!IdeficsDecoupledEmbedding.forwardc                 C   s$   d| j  d| j d| j d| j S )Nznum_embeddings=z, num_additional_embeddings=z, embedding_dim=, partially_freeze=)r_   rf   r`   r]   ri   r.   r.   r/   
extra_repr  s   $z$IdeficsDecoupledEmbedding.extra_repr)FNNN)
r&   r'   r(   r)   boolre   ru   strrx   __classcell__r.   r.   rk   r/   r\      s    
5'r\   c                       sj   e Zd ZdZ					ddedededed	ed
df fddZdejd
ejfddZ	d
e
fddZ  ZS )IdeficsDecoupledLineara  
    Implements a decoupling of parameters to allow freezing (or not) a subset of the parameters. In practise, the
    regular `weight` can be trained or frozen (i.e. `partially_freeze=True`), and if `out_additional_features` > 0,
    then it will create `out_additional_features * in_features` additional parameters that are always trained. If
    `out_additional_features=0`, then the module defaults back to the regular behavior of `nn.Linear`.
    r   TNin_featuresout_featuresout_additional_featuresbiasr]   r^   c                    sr   t  ||||| || _|| _|| _|| _|r&| jd |r&| jd |dkr7t	j
|||||d| _dS dS )aG  
        out_additional_features: int. Number of additional trainable dimensions. Only makes sense when
        `partially_freeze=True`. partially_freeze: bool. If True, the regular `weight` will be frozen and extra
        parameters (if any) will be trainable. If False, default to the regular behavior of nn.Linear.
        Fr   )r}   r~   r   r?   ra   N)rd   re   r   r]   r}   r~   rg   rW   r   r   rJ   additional_fc)ri   r}   r~   r   r   r]   r?   ra   rk   r.   r/   re      s$   zIdeficsDecoupledLinear.__init__inputc                 C   s:   t || j| j}| jdkr| |}t||fd}|S )Nr   r3   )rm   linearrg   r   r   r   r*   cat)ri   r   outputadditional_featuresr.   r.   r/   ru   D  s
   

zIdeficsDecoupledLinear.forwardc              
   C   s0   d| j  d| j d| j d| jdu d| j 
S )z=Overwriting `nn.Linear.extra_repr` to include new parameters.zin_features=z, out_features=z, out_additional_features=z, bias=Nrv   r}   r~   r   r   r]   rw   r.   r.   r/   rx   M  s   0z!IdeficsDecoupledLinear.extra_repr)r   TTNN)r&   r'   r(   r)   intry   re   r*   Tensorru   rz   rx   r{   r.   r.   rk   r/   r|     s,    	$	r|   c                       s.   e Zd Zd fdd	Zdd Zdd Z  ZS )	IdeficsRMSNormư>c                    s&   t    tt|| _|| _dS )z=
        IdeficsRMSNorm is equivalent to T5LayerNorm
        N)rd   re   r   	Parameterr*   onesrg   variance_epsilon)ri   hidden_sizeepsrk   r.   r/   re   T  s   

zIdeficsRMSNorm.__init__c                 C   s\   | tjdjddd}|t|| j  }| jjtj	tj
fv r)| | jj}| j| S )N   r3   T)keepdim)r>   r*   float32powmeanrsqrtr   rg   ra   float16bfloat16)ri   r#   variancer.   r.   r/   ru   \  s
   
zIdeficsRMSNorm.forwardc                 C   s   t | jj d| j S )Nz, eps=)r-   rg   r;   r   rw   r.   r.   r/   rx   f  s   zIdeficsRMSNorm.extra_repr)r   )r&   r'   r(   re   ru   rx   r{   r.   r.   rk   r/   r   S  s    
r   c                       s0   e Zd Zd
 fdd	Zdd Zddd	Z  ZS )IdeficsEmbedding   '  Nc                    sz   t    || _|| _|| _d| jtjd| jdtjdj|tj	d| j   }| j
d|dd | j|| jjt d	 d S )
N      ?r   r   ra   r?   ra   inv_freqF
persistentseq_lenr?   ra   )rd   re   dimmax_position_embeddingsbaser*   r:   int64r>   floatregister_buffer_set_cos_sin_cacher   r?   get_default_dtype)ri   r   r   r   r?   r   rk   r.   r/   re   l  s   
&
zIdeficsEmbedding.__init__c                 C   s|   || _ tj| j |tjd| j}td|| j}tj||fdd}| jd|	 
|dd | jd| 
|dd d S )	Nr   i,j->ijr3   r   
cos_cachedFr   
sin_cached)max_seq_len_cachedr*   r:   r   type_asr   einsumr   r   cosr>   sin)ri   r   r?   ra   rR   freqsembr.   r.   r/   r   }  s   z#IdeficsEmbedding._set_cos_sin_cachec                 C   sN   || j kr| j||j|jd | jd | j|jd| jd | j|jdfS )Nr   r   )r   r   r?   ra   r   r>   r   )ri   xr   r.   r.   r/   ru     s
   
zIdeficsEmbedding.forward)r   r   NrP   )r&   r'   r(   re   r   ru   r{   r.   r.   rk   r/   r   k  s    
r   c                 C   sH   | dd| j d d f }| d| j d d df }tj| |fddS )z*Rotates half the hidden dims of the input..Nr3   r   r   )r;   r*   r   )r   x1x2r.   r.   r/   rotate_half  s   r   c                 C   sL   ||  |}||  |}| | t| |  }|| t||  }||fS )an  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`):
            The position indices of the tokens corresponding to the query and key tensors. For example, this can be
            used to pass offsetted position ids when working with a KV-cache.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )	unsqueezer   )qkr   r   position_idsunsqueeze_dimq_embedk_embedr.   r.   r/   apply_rotary_pos_emb  s
   r   c                       s2   e Zd Zdededef fddZdd Z  ZS )
IdeficsMLPr   intermediate_size
hidden_actc                    sN   t    tj||dd| _tj||dd| _tj||dd| _t| | _d S )NFr   )	rd   re   r   rJ   	gate_proj	down_projup_projr   act_fn)ri   r   r   r   rk   r.   r/   re     s
   
zIdeficsMLP.__init__c                 C   s    |  | | || | S rP   )r   r   r   r   )ri   r   r.   r.   r/   ru         zIdeficsMLP.forward)r&   r'   r(   r   rz   re   ru   r{   r.   r.   rk   r/   r     s    r           rS   querykeyvaluer9   scalingdropoutc           
      K   s|   t ||dd| }|d ur|| }tjj|dt jd|j}tjj	||| j
d}t ||}	|	dd }	|	|fS )Nr3   )r   ra   ptrainingr   r   )r*   matmul	transposer   
functionalsoftmaxr   r>   ra   r   r   
contiguous)
rS   r   r   r   r9   r   r   rj   attn_weightsattn_outputr.   r.   r/   eager_attention_forward  s   
r   c                       s   e Zd ZdZ					ddedededed	edB d
ededB f fddZde	j
dedefddZ					dde	j
de	j
dB de	j
dB de	jdB dedB de	jdB dee dee	j
e	j
f fddZ  ZS )IdeficsAttentionz=Multi-headed attention from 'Attention Is All You Need' paperr   FNr   	num_headsr   is_cross_attentionconfigqk_layer_norms	layer_idxc           	         s  t    || _|| _|| _|| | _|| _d| _| jd | _|| _	|d u r1t
d| jj d | j| | jkrEtd| j d| d|| _ttjdsRtd	| jrt|jd
s^| jn|jj}tj| j|| j dd| _tj||| j dd| _tj||| j dd| _n'tj| j|| j dd| _tj| j|| j dd| _tj| j|| j dd| _tj|| j |dd| _t| j| _|| _| jrt| j|jd| _t| j|jd| _ d S d S )NTg      zInstantiating z without passing a `layer_idx` is not recommended and will lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` when creating this class.z?hidden_size must be divisible by num_heads (got `hidden_size`: z and `num_heads`: z).scaled_dot_product_attentionz)this model requires pytorch 2.0 or higher	embed_dimFr   r   )!rd   re   r   r   r   head_dimr   	is_causalr   r   loggerwarning_oncerl   r&   rc   r   hasattrr   r   vision_configr   rJ   q_projk_projv_projo_projr   
rotary_embr   r   rms_norm_epsq_layer_normk_layer_norm)	ri   r   r   r   r   r   r   r   kv_input_dimrk   r.   r/   re     s|   




zIdeficsAttention.__init__tensorr   bszc                 C   s    | ||| j| jdd S )Nr   r   )r<   r   r   r   r   )ri   r   r   r   r.   r.   r/   _shape4  r   zIdeficsAttention._shaper#   key_value_statesr9   r   r"   cache_positionrj   r^   c                 K   s  | j p|d u}| \}	}
}| ||	|
| j| jdd}|sD| ||	|
| j| jdd}| ||	|
| j| jdd}n)| \}}}| ||	|| j| jdd}| ||	|| j| jdd}|j	d }|d ur|||d 7 }|s| j
|t||
d\}}t|||||\}}|d urd|i}|||| j|\}}| jr| |}| |}t| jjt}|| ||||f| jsdn| j| jd|\}}||	|
d	 }| |}||fS )
Nr   r   r   r   )r   r   r   )r   r   r3   )r   sizer   r<   r   r   r   r   r   r;   r   maxr   updater   r   r   r   r   get_interfacer   _attn_implementationr   r   r   r   reshaper   r   )ri   r#   r   r9   r   r"   r   rj   r   r   q_len_query_states
key_statesvalue_stateskv_len
kv_seq_lenr   r   cache_kwargsattention_interfacer   r   r.   r.   r/   ru   7  sP   ""$" 




zIdeficsAttention.forward)r   FNFNNNNNN)r&   r'   r(   r)   r   r   ry   r   re   r*   r   r   
LongTensorr	   r   r   r-   ru   r{   r.   r.   rk   r/   r     sZ    Q	r   c                       s~   e Zd ZddededB f fddZe				ddejdejdB dej	dB d	e
dB d
ej	dB dee dejfddZ  ZS )IdeficsDecoderLayerNr   r   c                    sr   t    |j| _t| j|j|j||d| _t| j|j|j	d| _
t|j|jd| _t|j|jd| _|j| _d S )N)r   r   r   r   r   r   r   r   r   )rd   re   r   r   num_attention_headsr   	self_attnr   r   r   mlpr   r   input_layernormpost_attention_layernormri   r   r   rk   r.   r/   re   z  s"   
zIdeficsDecoderLayer.__init__r#   r9   r   r"   r   rj   r^   c           	      K   s   |}|  |}| jd|||||d|\}}tjj|| j| jd}|| }|}| |}| |}tjj|| j| jd}|| }|S )N)r#   r9   r   r"   r   r   r.   )r  r  r   r   r   r   r  r  )	ri   r#   r9   r   r"   r   rj   residualr  r.   r.   r/   ru     s&   




zIdeficsDecoderLayer.forwardrP   )NNNN)r&   r'   r(   r   r   re   r   r*   r   r  r	   r   r   r+   ru   r{   r.   r.   rk   r/   r  y  s,    r  c                       s   e Zd ZddededB f fddZe					ddejdejdB dejdB d	ejdB d
ejdB de	dB de
e dejfddZ  ZS )IdeficsGatedCrossAttentionLayerNr   r   c              	      s~  t    |j| _t| j|jd|j||j|d| _t| j|j	|j
d| _t|j|jd| _t|j|jd| _|j| _t | _t | _|jdkr|jdkrgttdd| j| _ttdd| j| _n|jdkrttd| _ttd| _ntd	|j d
|jdkr|jdkrttdd| j| _ttdd| j| _n|jdkrttd| _ttd| _njtd	|j d
|jdv r$|jdkrttjd|jdd| jfd| _ttjd|jdd| jfd| _n3|jdkrttjd|jdd| _ttjd|jdd| _ntd	|j d
t d|j dt!| dr9t!| ds=tdd S )NT)r   r   r   r   r   r   r   r  r   zerosvectorr   r   z Unknown value for `alpha_type` ()r   >   normalrandomgaussianr   )r   stdr   zAlpha initialization scheme z not yet implemented!alpha_cross_attnalpha_densez+Alpha parameters not initialized correctly!)"rd   re   r   r   r  r   r   
cross_attnr   r   r   r  r   r   r  r  r   r   Tanhact_cross_attn	act_densealpha_initializer
alpha_typer   r*   r  r  r  rc   r   r  alphas_initializer_rangeNotImplementedErrorr   r  rk   r.   r/   re     sn   
	








z(IdeficsGatedCrossAttentionLayer.__init__r#   r9   r%   r7   cross_attention_gater"   rj   r^   c           
      K   s   |du rt d|du rt d|durtd|}| |}| jd	|||d|\}}	tjj|| j| jd}|	|dkdddddf d}|| 
| j|  }|}| |}| |}tjj|| j| jd}|| | j|  }|S )
a  
        image_hidden_states (`torch.FloatTensor`):
            Input to the layer of shape `(batch, seq_len, embed_dim)`
        image_attention_mask (`torch.FloatTensor`, *optional*):
            image attention mask of size
            `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
        cross_attention_gate (`torch.FloatTensor`, *optional*):
            gate of size `(batch, seq_len)` used to zero-out cross-attention output for tokens attending no images.
        Nzt`image_hidden_states` is required for Idefics cross attention module which are visual features to be conditioned on.z`cross_attention_gate` is required for Idefics cross attention module to zero-out the cross-attention hidden_states attending to no images.zMPast key value states are not implemented for Idefics cross attention module.)r#   r   r9   r   r   r   r.   )rc   r'  r  r   r   r   r   r   r   masked_fillr"  r  r  r  r#  r  )
ri   r#   r9   r%   r7   r(  r"   rj   r  r  r.   r.   r/   ru     s8   

"

z'IdeficsGatedCrossAttentionLayer.forwardrP   r  )r&   r'   r(   r   r   re   r   r*   r   r	   r   r   r+   ru   r{   r.   r.   rk   r/   r    s2    B	r  c                       sf   e Zd ZU eed< dZdZdZddgZdZ	dZ
dZdZeeedd	d
dZe  fddZ  ZS )IdeficsPreTrainedModelr   rX   )imagetextTr  r  Fr   r  )index
layer_name)r#   r$   c                    s  t  | t|tr$t|j t|jt	
|jjd d d S t|trq| jjdkr=t|j t|j d S | jjdkrQt|j t|j d S | jjdv rotj|jd| jjd tj|jd| jjd d S d S t|tr~t|j d S t|trd|jt	
d	|jd
|j   }t|j| t	
|j|}t	d||}t	j||fdd}t|j|   t|j!|"  d S d S )Nr3   )r   r3   r  r   >   r  r  r  r   )r   r  r   r   r   r   r   )#rd   _init_weightsrQ   r   initnormal_class_embeddingcopy_r   r*   r:   r;   expandr  r   r$  zeros_r  r  ones_r&  r   latentsr   r   r   r   r   r   r   r   r   r   r   r   )ri   rS   r   rR   r   r   rk   r.   r/   r/  B  s4   
&


 z$IdeficsPreTrainedModel._init_weights)r&   r'   r(   r   r,   base_model_prefixinput_modalitiessupports_gradient_checkpointing_no_split_modules_supports_sdpa_supports_flash_attn_can_compile_fullgraph_supports_attention_backendr  r   r   _can_record_outputsr*   no_gradr/  r{   r.   r.   rk   r/   r*  0  s   
 r*  c                !       s   e Zd ZdZdef fddZdddZg fdd	Zg fd
dZe	e
e												ddejdB dejdB dejdB dedB dejdB dejdB dejdB dejdB dejdB dedB dedB dejdB dee deeB fddZ  ZS )IdeficsModelz
    Transformer decoder consisting of `config.num_hidden_layers` layers. Each layer is a [`IdeficsDecoderLayer`]

    Args:
        config: IdeficsConfig
    r   c                    s  t     | _ j| _ j| _t j j j j	| jd| _
 jj| _ j| _ j| j_t j| _ jrJ j}t  jj|j|j|j|j| _t fddt jD | _ j| _ j| j }t fddt|D | _d| _ t! j j"d| _#| $  | %  d S )N)r_   rf   r`   r]   rb   c                       g | ]}t  |d qS )r   )r  rL   ir   r.   r/   rO         z)IdeficsModel.__init__.<locals>.<listcomp>c                    rC  rD  )r  rE  rG  r.   r/   rO     rH  Fr   )&rd   re   r   pad_token_idrb   
vocab_sizer\   additional_vocab_sizer   freeze_text_layersembed_tokensr   
image_sizer   r   vision_modeluse_resamplerperceiver_configr   r   resampler_depthresampler_n_headsresampler_head_dimresampler_n_latentsperceiver_resamplerr   
ModuleListrangenum_hidden_layerslayerscross_layer_intervalgated_cross_attn_layersgradient_checkpointingr   r   norm	post_initfreeze_relevant_params)ri   r   rQ  num_cross_layersrk   rG  r/   re   k  sJ   

	zIdeficsModel.__init__Nc                 C   s>   |d u r| j }|jr| |j |jrt| j|jd d S d S N)rY   )r   rL  freeze_text_module_exceptionsfreeze_vision_layersr[   rO  freeze_vision_module_exceptions)ri   r   r.   r.   r/   r`    s   z#IdeficsModel.freeze_relevant_paramsc                 C   s"   | j | jfD ]}t||d qd S rb  )rZ  r^  r[   )ri   rY   rS   r.   r.   r/   rL    s   zIdeficsModel.freeze_text_layersc                 C   s   t | j|d d S rb  )r[   rO  )ri   rY   r.   r.   r/   rd    s   z!IdeficsModel.freeze_vision_layersFrB   r9   r   r"   inputs_embedsr4   r5   r6   r7   	use_cacheinterpolate_pos_encodingr   rj   r^   c           "      K   s  |dur|j n|j }|du |duA rtd|du r| |}|
r+|du r+t| jd}|j\}}}|dur9| nd}|| }|du rQtj|||jd  |j d}|durv|du rv|	 
dd }||dkd |dd| df }n	|du r|d}tdd	 |||fD d
krtd|dur|j| j|d}|jdd
 \}}| j|| g|jd
d R  }| j||dj}n|dur| \}}}}|j| j|d}||| ||}| jjr|du r| |}|d|d
}}n| \}}}}|}n|du r|d|d
}}ntd|||| |}|	d}|	d}	|	ddd|}	|	|||| }	|dur^| \}}}||f}|	du rXtj||d}	| |	}	nd}	|	dkjddj| jdjdd|}|du rtj||ftj|j d}t| j|||||d}|}t| jD ]0\}} || j  dkr| j!|| j   }!|!|||f|	|dd|}| |f||||d|}q| "|}|||||}t#|||dS )ab  
        image_encoder_embeddings (`torch.FloatTensor`, *optional*):
            The output of the image encoder.
        perceiver_embeddings (`torch.FloatTensor`, *optional*):
            The output of the perceiver resampler.
        image_attention_mask (`torch.LongTensor`, *optional*):
            The attention mask for the image encoder.
        Nz:You must specify exactly one of input_ids or inputs_embedsrG  r   r   )r?   r3   c                 s   s    | ]}|d u V  qd S rP   r.   )rL   r   r.   r.   r/   rT     s    z'IdeficsModel.forward.<locals>.<genexpr>r   z_Exactly 1 of pixel_values, image_encoder_embeddings or perceiver_embeddings has to be not-None.)ra   r?   )r4   rh  zBIf `perceiver_embeddings` are passed, use_resampler should be Truer   r   r   )r   rf  r9   r   r"   r   )r7   r(  r"   )r9   r   r"   r   )r!   r%   r"   )$r?   rc   rM  r
   r   r;   get_seq_lengthr*   r:   longcumsummasked_fill_r   sumr>   ra   r   r<   rO  r!   r   rP  rV  r=   r   invert_attention_maskrV   squeezery   r   	enumeraterZ  r[  r\  r^  r    )"ri   rB   r9   r   r"   rf  r4   r5   r6   r7   rg  rh  r   rj   r?   
batch_size
seq_lengthr  past_key_values_lengthseq_length_with_past
num_imagesr%   image_seq_lenimage_hidden_sizetext_seq_lenimage_batch_sizeimage_sequence_lengthimage_hidden_shaper(  causal_maskr#   idxdecoder_layercross_attn_blockr.   r.   r/   ru     s   

$






"
	


	zIdeficsModel.forwardrP   )NNNNNNNNNNFN)r&   r'   r(   r)   r   re   r`  rL  rd  r   r   r   r*   r  r   r	   r+   ry   r   r   r-   r    ru   r{   r.   r.   rk   r/   rB  b  sd    
2
	
rB  c                $       s>  e Zd ZddiZd! fdd	Zee														d"dejdB d	ej	dB d
ejdB de
dB dejdB dejdB dejdB dejdB dej	dB dejdB dedB dedB dejdB deej	B dee deeB f ddZ									d# fdd	Z	d$dedeeef dedeeef f fdd Z  ZS )%IdeficsForVisionText2Textlm_head.weightmodel.embed_tokens.weightNc                    sT   t  | t|| _t|j|j|jd|jd| _	|jdkr$ddd| _
|   d S )NFr   r   r  z.model.embed_tokens.additional_embedding.weight)r  zlm_head.additional_fc.weight)rd   re   rB  rX   r|   r   rJ  rK  freeze_lm_headlm_head_tied_weights_keysr_  )ri   r   rO  rk   r.   r/   re   U  s   

z"IdeficsForVisionText2Text.__init__Fr   rB   r9   r   r"   rf  r4   r5   r6   r7   labelsrg  rh  r   logits_to_keeprj   r^   c                 K   s   | j d|||||||||	||d|d|}|j}t|tr$t| dn|}| |dd|ddf }d}|
durH| jd||
| jjd|}t	|||j
|j|j|jdS )aC  
        image_encoder_embeddings (`torch.FloatTensor`, *optional*):
            The output of the image encoder.
        perceiver_embeddings (`torch.FloatTensor`, *optional*):
            The output of the perceiver resampler.
        image_attention_mask (`torch.LongTensor`, *optional*):
            The attention mask for the image encoder.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoProcessor, IdeficsForVisionText2Text

        >>> model = IdeficsForVisionText2Text.from_pretrained("HuggingFaceM4/idefics-9b")
        >>> processor = AutoProcessor.from_pretrained("HuggingFaceM4/idefics-9b")

        >>> dogs_image_url_1 = "https://huggingface.co/datasets/hf-internal-testing/fixtures_nlvr2/raw/main/image1.jpeg"
        >>> dogs_image_url_2 = "https://huggingface.co/datasets/hf-internal-testing/fixtures_nlvr2/raw/main/image2.jpeg"

        >>> prompts = [
        ...     [
        ...         "User:",
        ...         dogs_image_url_1,
        ...         "Describe this image.\nAssistant: An image of two dogs.\n",
        ...         "User:",
        ...         dogs_image_url_2,
        ...         "Describe this image.\nAssistant:",
        ...     ]
        ... ]
        >>> inputs = processor(prompts, return_tensors="pt")
        >>> generate_ids = model.generate(**inputs, max_new_tokens=6)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True)
        ```T)rB   r9   r   r"   rf  r4   r5   r6   r7   rg  rh  return_dictr   N)r2   r  rJ  )r1   r2   r"   r#   r$   r%   r.   )rX   r!   rQ   r   slicer  loss_functionr   rJ  r0   r"   r#   r$   r%   )ri   rB   r9   r   r"   rf  r4   r5   r6   r7   r  rg  rh  r   r  rj   outputsr#   slice_indicesr2   r1   r.   r.   r/   ru   i  s>   9z!IdeficsForVisionText2Text.forwardc              
      s   i }|d ur| j jr||d< n	||d< n||d< |dd|d< t j|f||||||
|	d||}|	d urP|d u rP|d jd }|	d d | d f |d	< |S )
Nr6   r5   r4   rh  F)r"   r9   rf  r   r   rg  r7   rB   r   r7   )r   rP  poprd   prepare_inputs_for_generationr;   )ri   rB   r9   r   rf  r"   r   r4   r%   r7   rg  rj   images_kwargsmodel_inputsrr  rk   r.   r/   r    s4   

	
z7IdeficsForVisionText2Text.prepare_inputs_for_generationr  rF   rD   c                    s~   t  j|||fi |}d|v r8|d }|d d dd d f d}|ddr-||d< ntj||gdd|d< |j|d< |S )Nr7   r3   r   rg  Tr   r%   )rd   #_update_model_kwargs_for_generationr   rA   r*   r   r%   )ri   r  rF   rD   rj   r7   	last_maskrk   r.   r/   r    s   

z=IdeficsForVisionText2Text._update_model_kwargs_for_generationrP   )NNNNNNNNNNNFNr   )	NNNNNNNNN)F)r&   r'   r(   r  re   r   r   r*   r  r   r	   r+   ry   r   r   r   r-   r0   ru   r  r   dictrz   r   r  r{   r.   r.   rk   r/   r  R  s    	
]1

r  )r  rB  r*  )r   FNN)r   )r   )Nr)   collections.abcr   dataclassesr   typingr   r*   torch.nn.functionalr   r   rm    r   r0  activationsr   cache_utilsr	   r
   
generationr   masking_utilsr   modeling_layersr   modeling_outputsr   modeling_utilsr   r   r   processing_utilsr   utilsr   r   r   r   utils.genericr   utils.output_capturingr   r   configuration_ideficsr   	perceiverr   visionr   r   
get_loggerr&   r   r    r0   rH   r[   rK   r\   rJ   r|   Moduler   r   r   r   r   r   r   r   r   r  r  r*  rB  r  __all__r.   r.   r.   r/   <module>   s   

-i<'

 81 p <