o
    i                     @   s  d Z ddlmZ ddlmZmZmZmZ ddlZddl	m
  mZ ddlm
Z
 ddlmZ ddlmZmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZmZmZ ddlmZ ddl m!Z!m"Z"m#Z#m$Z$ ddl%m&Z& ddl'm(Z(m)Z) ddl*m+Z+ ddl,m-Z- ddl.m/Z/m0Z0 e$1e2Z3ee"ddG dd deZ4ee"ddG dd deZ5				dGddZ6g fd d!Z7G d"d# d#e
j8Z9G d$d% d%e
j:Z;G d&d' d'e
j<Z=G d(d) d)ej
j<Z>d*d+ Z?dHd,d-Z@G d.d/ d/e
j<ZA	0dId1e
j<d2ejBd3ejBd4ejBd5eejB d6eCd7eCfd8d9ZDG d:d; d;e
j<ZEG d<d= d=eZFG d>d? d?eZGe"G d@dA dAeZHe"G dBdC dCeHZIG dDdE dEeHeZJg dFZKdS )JzPyTorch Idefics model.    )	dataclass)AnyCallableOptionalUnionN)nn   )ACT2FN)CacheDynamicCache)GenerationMixin)create_causal_mask)GradientCheckpointingLayer)ModelOutput)ALL_ATTENTION_FUNCTIONSPretrainedConfigPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging)deprecate_kwarg)OutputRecordercheck_model_inputs   )IdeficsConfig)IdeficsPerceiverResampler)IdeficsVisionEmbeddingsIdeficsVisionTransformerz{
    Base class for Idefics model's outputs that may also contain a past key/values (to speed up sequential decoding).
    )custom_introc                   @   sv   e Zd ZU dZdZeej ed< dZ	ee
 ed< dZeeej  ed< dZeeej  ed< dZeeej  ed< dS )IdeficsBaseModelOutputWithPasta  
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Sequence of hidden-states at the output of the last layer of the model.

        If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
        hidden_size)` is output.
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
        `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
        input) to speed up sequential decoding.
    image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
        Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
        sequence_length, hidden_size)`.

        image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver
    Nlast_hidden_statepast_key_valueshidden_states
attentionsimage_hidden_states)__name__
__module____qualname____doc__r"   r   torchFloatTensor__annotations__r#   r
   r$   tupler%   r&    r/   r/   `/home/ubuntu/.local/lib/python3.10/site-packages/transformers/models/idefics/modeling_idefics.pyr!   0   s   
 r!   zS
    Base class for Idefics causal language model (or autoregressive) outputs.
    c                   @   s   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
ee ed< dZeeej  ed< dZeeej  ed< dZeeej  ed< dS )	IdeficsCausalLMOutputWithPastae  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
        Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
        sequence_length, hidden_size)`.

        image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver
    Nlosslogitsr#   r$   r%   r&   )r'   r(   r)   r*   r2   r   r+   r,   r-   r3   r#   r
   r$   r.   r%   r&   r/   r/   r/   r0   r1   Q   s   
 r1   Fc                 K   sB  t | jd ddd|d| j}| d|} |d|d< |d|d< |d|d< |d|d< d|v rI|d }|d||d< |d urU|d||d	< |d d ure|d d||d< |d d ury|d d||d< | |fS |d d ur|d d||d< | |fS |d d ur|d d||d< | |fS )
Nr   r   pixel_valuesimage_encoder_embeddingsperceiver_embeddingsimage_attention_masktoken_type_idsattention_mask)	r+   arangeshapeviewrepeattodeviceindex_selectget)	input_idsexpand_sizeis_encoder_decoderr:   encoder_outputsmodel_kwargsexpanded_return_idxr9   r/   r/   r0   expand_inputs_for_generationq   s:   ,		rI   c                    sf   t jt jt jd  fdd|D }|  D ]|r+tfdd|D r+d qd q| S )N)	LayerNormLinear	Embeddingc                    s   g | ]} | qS r/   r/   ).0m)mappingr/   r0   
<listcomp>   s    z freeze_model.<locals>.<listcomp>c                 3   s    | ]}t  |V  qd S N)
isinstance)rM   t)moduler/   r0   	<genexpr>   s    zfreeze_model.<locals>.<genexpr>TF)r   rJ   rK   rL   modulesanyrequires_grad_)modelmodule_exceptionsmodule_exceptions_mappedr/   )rO   rT   r0   freeze_model   s   r\   c                       sN   e Zd ZdZ				ddee ddf fddZdd	 Zdefd
dZ	  Z
S )IdeficsDecoupledEmbeddinga  
    Implements a decoupling of parameters to allow freezing (or not) a subset of the embeddings. In practise, the
    regular `weight` can be trained or frozen (i.e. `partially_freeze=True`), and if `num_additional_embeddings` > 0,
    then it will create `num_additional_embeddings` additional parameters that are always trained. If
    `num_additional_embeddings=0`, then the module defaults back to the regular behavior of `nn.Embedding`.
    FNpartially_freezereturnc           	         s   |dur||krt d| d| t jd|||||d| || _|| _|| _|| _|r5| jd | jdkrGt	j
| j|||d| _dS dS )	a)  
        Args:
            num_embeddings (`int`):
                Size of the dictionary of embeddings
            num_additional_embeddings (`int`):
                Number of additional embeddings. Only useful when you `partially_freeze=True`.
            embedding_dim (`int`):
                The size of each embedding vector
            partially_freeze: (`bool`, *optional*, defaults to `False`):
                If `True`, the regular `weight` will be frozen. `additional_weight` is never frozen.
            padding_idx (`int`, *optional*):
                The padding index (needs to be less than num_embeddings)

        Note: there are a lot of other parameters to initialize a standard `nn.Embedding` such as `padding_idx`,
        `max_norm` or `norm_type`. We are not supporting these.
        Nz/padding_idx must be within num_embeddings. Got z and )num_embeddingsembedding_dimr@   dtypepadding_idxFr   )r`   ra   r@   rb   r/   )
ValueErrorsuper__init__r`   rc   num_additional_embeddingsr^   weightrX   r   rL   additional_embedding)	selfr`   rg   ra   r^   r@   rb   rc   kwargs	__class__r/   r0   rf      s2   
z"IdeficsDecoupledEmbedding.__init__c                 C   sj   | j dkrt|| jS | }t|| jk}|| }| || j }d||< t|| j}|||< |S )a  
        we have 2 embeddings, with different indices - one pretrained self.weight and another
        self.additional_embedding.weight that is being trained.

        in order to make a lookup of the input ids, we:
        1. find out the indices of the entries belonging to the 2nd embedding
        2. extract those values while subtracting the size of the first embedding (num_embeddings), since the 2nd
           embedding starts from 0 and not num_embeddings
        3. perform the 2nd embedding lookup
        4. now we handle the 1st embedding, we overwrite indices belonging to the 2nd embedding with a padding index
        5. perform the 1st embedding lookup
        6. now we overwrite the values in the 1st embedding lookup with the values of the 2nd embedding lookup

        note: for the 1st embedding lookup we could have looked up only the low indices and not do the padding, but
        then we have to create a new tensor and populate it with 2 tensors that are spread out across various indices -
        i.e. not a simple concat - I haven't benchmarked the complex case if it's any faster, given that seqlens are
        usually relatively short it's probably not faster or if faster not by much - but might be a good idea to
        measure.

        r   )	rg   F	embeddingrh   cloner+   wherer`   ri   )rj   rC   additional_vocab_indicesinput_ids_additional_vocabadditional_embeddingsfull_vectorr/   r/   r0   forward   s   
z!IdeficsDecoupledEmbedding.forwardc                 C   s$   d| j  d| j d| j d| j S )Nznum_embeddings=z, num_additional_embeddings=z, embedding_dim=, partially_freeze=)r`   rg   ra   r^   rj   r/   r/   r0   
extra_repr  s   $z$IdeficsDecoupledEmbedding.extra_repr)FNNN)r'   r(   r)   r*   r   boolrf   rv   strry   __classcell__r/   r/   rl   r0   r]      s    
5'r]   c                       sj   e Zd ZdZ					ddedededed	ed
df fddZdejd
ejfddZ	d
e
fddZ  ZS )IdeficsDecoupledLineara  
    Implements a decoupling of parameters to allow freezing (or not) a subset of the parameters. In practise, the
    regular `weight` can be trained or frozen (i.e. `partially_freeze=True`), and if `out_additional_features` > 0,
    then it will create `out_additional_features * in_features` additional parameters that are always trained. If
    `out_additional_features=0`, then the module defaults back to the regular behavior of `nn.Linear`.
    r   TNin_featuresout_featuresout_additional_featuresbiasr^   r_   c                    sr   t  ||||| || _|| _|| _|| _|r&| jd |r&| jd |dkr7t	j
|||||d| _dS dS )aG  
        out_additional_features: int. Number of additional trainable dimensions. Only makes sense when
        `partially_freeze=True`. partially_freeze: bool. If True, the regular `weight` will be frozen and extra
        parameters (if any) will be trainable. If False, default to the regular behavior of nn.Linear.
        Fr   )r~   r   r   r@   rb   N)re   rf   r   r^   r~   r   rh   rX   r   r   rK   additional_fc)rj   r~   r   r   r   r^   r@   rb   rl   r/   r0   rf     s$   zIdeficsDecoupledLinear.__init__inputc                 C   s:   t || j| j}| jdkr| |}t||fd}|S )Nr   r4   )rn   linearrh   r   r   r   r+   cat)rj   r   outputadditional_featuresr/   r/   r0   rv   C  s
   

zIdeficsDecoupledLinear.forwardc              
   C   s0   d| j  d| j d| j d| jdu d| j 
S )z=Overwriting `nn.Linear.extra_repr` to include new parameters.zin_features=z, out_features=z, out_additional_features=z, bias=Nrw   r~   r   r   r   r^   rx   r/   r/   r0   ry   L  s   0z!IdeficsDecoupledLinear.extra_repr)r   TTNN)r'   r(   r)   r*   intrz   rf   r+   Tensorrv   r{   ry   r|   r/   r/   rl   r0   r}     s,    	$	r}   c                       s.   e Zd Zd fdd	Zdd Zdd Z  ZS )	IdeficsRMSNormư>c                    s&   t    tt|| _|| _dS )z=
        IdeficsRMSNorm is equivalent to T5LayerNorm
        N)re   rf   r   	Parameterr+   onesrh   variance_epsilon)rj   hidden_sizeepsrl   r/   r0   rf   S  s   

zIdeficsRMSNorm.__init__c                 C   s\   | tjdjddd}|t|| j  }| jjtj	tj
fv r)| | jj}| j| S )N   r4   T)keepdim)r?   r+   float32powmeanrsqrtr   rh   rb   float16bfloat16)rj   r$   variancer/   r/   r0   rv   [  s
   
zIdeficsRMSNorm.forwardc                 C   s   t | jj d| j S )Nz, eps=)r.   rh   r<   r   rx   r/   r/   r0   ry   e  s   zIdeficsRMSNorm.extra_repr)r   )r'   r(   r)   rf   rv   ry   r|   r/   r/   rl   r0   r   R  s    
r   c                       s0   e Zd Zd
 fdd	Zdd Zddd	Z  ZS )IdeficsEmbedding   '  Nc                    sz   t    || _|| _|| _d| jtjd| jdtjdj|tj	d| j   }| j
d|dd | j|| jjt d	 d S )
N      ?r   r   rb   r@   rb   inv_freqF
persistentseq_lenr@   rb   )re   rf   dimmax_position_embeddingsbaser+   r;   int64r?   floatregister_buffer_set_cos_sin_cacher   r@   get_default_dtype)rj   r   r   r   r@   r   rl   r/   r0   rf   k  s   
&
zIdeficsEmbedding.__init__c                 C   s|   || _ tj| j |tjd| j}td|| j}tj||fdd}| jd|	 
|dd | jd| 
|dd d S )	Nr   zi,j->ijr4   r   
cos_cachedFr   
sin_cached)max_seq_len_cachedr+   r;   r   type_asr   einsumr   r   cosr?   sin)rj   r   r@   rb   rS   freqsembr/   r/   r0   r   |  s   z#IdeficsEmbedding._set_cos_sin_cachec                 C   sN   || j kr| j||j|jd | jd | j|jd| jd | j|jdfS )Nr   r   )r   r   r@   rb   r   r?   r   )rj   xr   r/   r/   r0   rv     s
   
zIdeficsEmbedding.forward)r   r   NrQ   )r'   r(   r)   rf   r   rv   r|   r/   r/   rl   r0   r   j  s    
r   c                 C   sH   | dd| j d d f }| d| j d d df }tj| |fddS )z*Rotates half the hidden dims of the input..Nr4   r   r   )r<   r+   r   )r   x1x2r/   r/   r0   rotate_half  s   r   c                 C   sL   ||  |}||  |}| | t| |  }|| t||  }||fS )an  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`):
            The position indices of the tokens corresponding to the query and key tensors. For example, this can be
            used to pass offsetted position ids when working with a KV-cache.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )	unsqueezer   )qkr   r   position_idsunsqueeze_dimq_embedk_embedr/   r/   r0   apply_rotary_pos_emb  s
   r   c                       s2   e Zd Zdededef fddZdd Z  ZS )
IdeficsMLPr   intermediate_size
hidden_actc                    sN   t    tj||dd| _tj||dd| _tj||dd| _t| | _d S )NFr   )	re   rf   r   rK   	gate_proj	down_projup_projr	   act_fn)rj   r   r   r   rl   r/   r0   rf     s
   
zIdeficsMLP.__init__c                 C   s    |  | | || | S rQ   )r   r   r   r   )rj   r   r/   r/   r0   rv         zIdeficsMLP.forward)r'   r(   r)   r   r{   rf   rv   r|   r/   r/   rl   r0   r     s    r           rT   querykeyvaluer:   scalingdropoutc           
      K   s|   t ||dd| }|d ur|| }tjj|dt jd|j}tjj	||| j
d}t ||}	|	dd }	|	|fS )Nr4   )r   rb   ptrainingr   r   )r+   matmul	transposer   
functionalsoftmaxr   r?   rb   r   r   
contiguous)
rT   r   r   r   r:   r   r   rk   attn_weightsattn_outputr/   r/   r0   eager_attention_forward  s   
r   c                       s   e Zd ZdZ					d dedededed	ee d
edee f fddZ	de
jdedefddZedddd					d!de
jdee
j dee
j dee
j dee dee
j dee dee
je
jf fddZ  ZS )"IdeficsAttentionz=Multi-headed attention from 'Attention Is All You Need' paperr   FNr   	num_headsr   is_cross_attentionconfigqk_layer_norms	layer_idxc           	         s  t    || _|| _|| _|| | _|| _d| _| jd | _|| _	|d u r1t
d| jj d | j| | jkrEtd| j d| d|| _ttjdsRtd	| jrt|jd
s^| jn|jj}tj| j|| j dd| _tj||| j dd| _tj||| j dd| _n'tj| j|| j dd| _tj| j|| j dd| _tj| j|| j dd| _tj|| j |dd| _t| j| _|| _| jrt| j|jd| _t| j|jd| _ d S d S )NTg      zInstantiating z without passing a `layer_idx` is not recommended and will lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` when creating this class.z?hidden_size must be divisible by num_heads (got `hidden_size`: z and `num_heads`: z).scaled_dot_product_attentionz)this model requires pytorch 2.0 or higher	embed_dimFr   r   )!re   rf   r   r   r   head_dimr   	is_causalr   r   loggerwarning_oncerm   r'   rd   r   hasattrr   r   vision_configr   rK   q_projk_projv_projo_projr   
rotary_embr   r   rms_norm_epsq_layer_normk_layer_norm)	rj   r   r   r   r   r   r   r   kv_input_dimrl   r/   r0   rf     s|   




zIdeficsAttention.__init__tensorr   bszc                 C   s    | ||| j| jdd S )Nr   r   )r=   r   r   r   r   )rj   r   r   r   r/   r/   r0   _shape3  r   zIdeficsAttention._shapepast_key_valuer#   4.58new_nameversionr$   key_value_statesr:   r   cache_positionrk   r_   c                 K   s  | j p|d u}| \}	}
}| ||	|
| j| jdd}|sD| ||	|
| j| jdd}| ||	|
| j| jdd}n)| \}}}| ||	|| j| jdd}| ||	|| j| jdd}|j	d }|d ur|||d 7 }|s| j
|t||
d\}}t|||||\}}|d urd|i}|||| j|\}}| jr| |}| |}t}| jjdkrt| jj }|| ||||f| jsdn| j| jd	|\}}||	|
d
 }| |}||fS )Nr   r   r   r   )r   r   eagerr   )r   r   r4   )r   sizer   r=   r   r   r   r   r   r<   r   maxr   updater   r   r   r   r   r   _attn_implementationr   r   r   r   reshaper   r   )rj   r$   r   r:   r   r#   r   rk   r   r   q_len_query_states
key_statesvalue_stateskv_len
kv_seq_lenr   r   cache_kwargsattention_interfacer   r   r/   r/   r0   rv   6  sP   ""$" 




zIdeficsAttention.forward)r   FNFNNNNNN)r'   r(   r)   r*   r   r   rz   r   r   rf   r+   r   r   r   
LongTensorr
   r   r   r.   rv   r|   r/   r/   rl   r0   r     s\    Q	r   c                       s   e Zd Zddedee f fddZedddd	e				dd
e	j
dee	j
 dee	j dee dee	j dee de	jfddZ  ZS )IdeficsDecoderLayerNr   r   c                    sr   t    |j| _t| j|j|j||d| _t| j|j|j	d| _
t|j|jd| _t|j|jd| _|j| _d S )N)r   r   r   r   r   r   r   r   r   )re   rf   r   r   num_attention_headsr   	self_attnr   r   r   mlpr   r   input_layernormpost_attention_layernormrj   r   r   rl   r/   r0   rf   {  s"   
zIdeficsDecoderLayer.__init__r   r#   r   r   r$   r:   r   r   rk   r_   c           	      K   s   |}|  |}| jd|||||d|\}}tjj|| j| jd}|| }|}| |}| |}tjj|| j| jd}|| }|S )N)r$   r:   r   r#   r   r   r/   )r  r  r   r   r   r   r  r  )	rj   r$   r:   r   r#   r   rk   residualr  r/   r/   r0   rv     s&   



zIdeficsDecoderLayer.forwardrQ   )NNNN)r'   r(   r)   r   r   r   rf   r   r   r+   r   r  r
   r   r   r,   rv   r|   r/   r/   rl   r0   r  z  s.    r  c                       s   e Zd Zddedee f fddZedddd	e					dd
e	j
dee	j
 dee	j
 dee	j
 dee	j
 dee dee de	jfddZ  ZS )IdeficsGatedCrossAttentionLayerNr   r   c              	      s~  t    |j| _t| j|jd|j||j|d| _t| j|j	|j
d| _t|j|jd| _t|j|jd| _|j| _t | _t | _|jdkr|jdkrgttdd| j| _ttdd| j| _n|jdkrttd| _ttd| _ntd	|j d
|jdkr|jdkrttdd| j| _ttdd| j| _n|jdkrttd| _ttd| _njtd	|j d
|jdv r$|jdkrttjd|jdd| jfd| _ttjd|jdd| jfd| _n3|jdkrttjd|jdd| _ttjd|jdd| _ntd	|j d
t d|j dt!| dr9t!| ds=tdd S )NT)r   r   r   r   r   r   r   r  r   zerosvectorr   r   z Unknown value for `alpha_type` ()r   >   normalrandomgaussianr   )r   stdr  zAlpha initialization scheme z not yet implemented!alpha_cross_attnalpha_densez+Alpha parameters not initialized correctly!)"re   rf   r   r   r  r   r   
cross_attnr   r   r   r  r   r   r  r  r   r   Tanhact_cross_attn	act_densealpha_initializer
alpha_typer   r+   r  r#  r$  rd   r   r  alphas_initializer_rangeNotImplementedErrorr   r  rl   r/   r0   rf     sn   
	








z(IdeficsGatedCrossAttentionLayer.__init__r   r#   r   r   r$   r:   r&   r8   cross_attention_gaterk   r_   c           
      K   s   |du rt d|du rt d|durtd|}| |}| jd	|||d|\}}	tjj|| j| jd}|	|dkdddddf d}|| 
| j|  }|}| |}| |}tjj|| j| jd}|| | j|  }|S )
a  
        image_hidden_states (`torch.FloatTensor`):
            Input to the layer of shape `(batch, seq_len, embed_dim)`
        image_attention_mask (`torch.FloatTensor`, *optional*):
            image attention mask of size
            `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
        cross_attention_gate (`torch.FloatTensor`, *optional*):
            gate of size `(batch, seq_len)` used to zero-out cross-attention output for tokens attending no images.
        Nzt`image_hidden_states` is required for Idefics cross attention module which are visual features to be conditioned on.z`cross_attention_gate` is required for Idefics cross attention module to zero-out the cross-attention hidden_states attending to no images.zMPast key value states are not implemented for Idefics cross attention module.)r$   r   r:   r   r   r   r/   )rd   r,  r  r%  r   r   r   r   r   masked_fillr'  r#  r  r  r(  r$  )
rj   r$   r:   r&   r8   r-  r#   rk   r  r  r/   r/   r0   rv     s8   

"

z'IdeficsGatedCrossAttentionLayer.forwardrQ   r  )r'   r(   r)   r   r   r   rf   r   r   r+   r   r
   r   r   r,   rv   r|   r/   r/   rl   r0   r    s4    B	r  c                   @   sR   e Zd ZU eed< dZdZddgZdZdZ	dZ
dZeeeddd	d
Zdd ZdS )IdeficsPreTrainedModelr   rY   Tr  r  Fr   r  )index
layer_name)r$   r%   c                 C   s  | j j}t|tjtjfr%|jjjd|d |j	d ur#|j	j
  d S d S t|tjrF|jjjd|d |jd urD|jj|j 
  d S d S t|tjr[|jjd |j	j
  d S t|tri|jjd d S t|trv|jj  d S t|tr| j jdkr|jj
  |jj
  d S | j jdkr|jjd |jjd d S | j jdv r|jjjd| j jd |jjjd| j jd d S d S t|tr|jj  d S d S )Nr   )r   r"  r   r  r   >   r  r   r!  )r   initializer_rangerR   r   rK   Conv2drh   datanormal_r   zero_rL   rc   rJ   fill_r   r   class_embeddingr  r)  r#  r$  r+  r   latents)rj   rT   r"  r/   r/   r0   _init_weightsD  s@   





z$IdeficsPreTrainedModel._init_weightsN)r'   r(   r)   r   r-   base_model_prefixsupports_gradient_checkpointing_no_split_modules_supports_sdpa_supports_flash_attn_can_compile_fullgraph_supports_attention_backendr  r   r   _can_record_outputsr:  r/   r/   r/   r0   r/  3  s   
 r/  c                !       s   e Zd ZdZdef fddZdddZg fdd	Zg fd
dZe	e
												ddeej deej deej dee deej deej deej deej deej dee dee deej dee deeef fddZ  ZS )IdeficsModelz
    Transformer decoder consisting of `config.num_hidden_layers` layers. Each layer is a [`IdeficsDecoderLayer`]

    Args:
        config: IdeficsConfig
    r   c                    s  t     | _ j| _ j| _t j j j j	| jd| _
 jj| _ j| _ j| j_t j| _ jrJ j}t  jj|j|j|j|j| _t fddt jD | _ j| _ j| j }t fddt|D | _d| _ t! j j"d| _#| $  | %  d S )N)r`   rg   ra   r^   rc   c                       g | ]}t  |d qS )r   )r  rM   ir   r/   r0   rP         z)IdeficsModel.__init__.<locals>.<listcomp>c                    rD  rE  )r  rF  rH  r/   r0   rP     rI  Fr   )&re   rf   r   pad_token_idrc   
vocab_sizer]   additional_vocab_sizer   freeze_text_layersembed_tokensr   
image_sizer  r   vision_modeluse_resamplerperceiver_configr   r   resampler_depthresampler_n_headsresampler_head_dimresampler_n_latentsperceiver_resamplerr   
ModuleListrangenum_hidden_layerslayerscross_layer_intervalgated_cross_attn_layersgradient_checkpointingr   r   norm	post_initfreeze_relevant_params)rj   r   rR  num_cross_layersrl   rH  r0   rf   o  sJ   

	zIdeficsModel.__init__Nc                 C   s>   |d u r| j }|jr| |j |jrt| j|jd d S d S N)rZ   )r   rM  freeze_text_module_exceptionsfreeze_vision_layersr\   rP  freeze_vision_module_exceptions)rj   r   r/   r/   r0   ra    s   z#IdeficsModel.freeze_relevant_paramsc                 C   s"   | j | jfD ]}t||d qd S rc  )r[  r_  r\   )rj   rZ   rT   r/   r/   r0   rM    s   zIdeficsModel.freeze_text_layersc                 C   s   t | j|d d S rc  )r\   rP  )rj   rZ   r/   r/   r0   re    s   z!IdeficsModel.freeze_vision_layersFrC   r:   r   r#   inputs_embedsr5   r6   r7   r8   	use_cacheinterpolate_pos_encodingr   rk   r_   c           "      K   s  |dur|j n|j }|du |duA rtd|du r| |}|
r+|du r+t| jd}|j\}}}|dur9| nd}|| }|du rQtj|||jd  |j d}|durv|du rv|	 
dd }||dkd |dd| df }n	|du r|d}tdd	 |||fD d
krtd|dur|j| j|d}|jdd
 \}}| j|| g|jd
d R  }| j||dj}n|dur| \}}}}|j| j|d}||| ||}| jjr|du r| |}|d|d
}}n| \}}}}|}n|du r|d|d
}}ntd|||| |}|	d}|	d}	|	ddd|}	|	|||| }	|dur^| \}}}||f}|	du rXtj||d}	| |	}	nd}	|	dkjddj| jdjdd|}|du rtj||ftj|j d}t| j|||||d}|}t| jD ]0\}} || j  dkr| j!|| j   }!|!|||f|	|dd|}| |f||||d|}q| "|}|||||}t#|||dS )ab  
        image_encoder_embeddings (`torch.FloatTensor`, *optional*):
            The output of the image encoder.
        perceiver_embeddings (`torch.FloatTensor`, *optional*):
            The output of the perceiver resampler.
        image_attention_mask (`torch.LongTensor`, *optional*):
            The attention mask for the image encoder.
        Nz:You must specify exactly one of input_ids or inputs_embedsrH  r   r   )r@   r4   c                 s   s    | ]}|d u V  qd S rQ   r/   )rM   r   r/   r/   r0   rU     s    z'IdeficsModel.forward.<locals>.<genexpr>r   z_Exactly 1 of pixel_values, image_encoder_embeddings or perceiver_embeddings has to be not-None.)rb   r@   )r5   ri  zBIf `perceiver_embeddings` are passed, use_resampler should be Truer   r   r   )r   input_embedsr:   r   r#   r   )r8   r-  r#   )r:   r   r#   r   )r"   r&   r#   )$r@   rd   rN  r   r   r<   get_seq_lengthr+   r;   longcumsummasked_fill_r   sumr?   rb   r   r=   rP  r"   r  rQ  rW  r>   r   invert_attention_maskrW   squeezerz   r   	enumerater[  r\  r]  r_  r!   )"rj   rC   r:   r   r#   rg  r5   r6   r7   r8   rh  ri  r   rk   r@   
batch_size
seq_lengthr  past_key_values_lengthseq_length_with_past
num_imagesr&   image_seq_lenimage_hidden_sizetext_seq_lenimage_batch_sizeimage_sequence_lengthimage_hidden_shaper-  causal_maskr$   idxdecoder_layercross_attn_blockr/   r/   r0   rv     s   

$






"
	


	zIdeficsModel.forwardrQ   )NNNNNNNNNNFN)r'   r(   r)   r*   r   rf   ra  rM  re  r   r   r   r+   r  r   r
   r,   rz   r   r   r   r.   r!   rv   r|   r/   r/   rl   r0   rC  f  sb    
2
	

rC  c                #       s>  e Zd ZddgZd! fdd	Zdd Zee													d"d	ee	j
 d
ee	j dee	j
 dee dee	j dee	j dee	j dee	j dee	j dee	j
 dee dee dee	j
 dee deeef fddZ									d# fdd	Z	d$dedeeef dedeeef f fdd Z  ZS )%IdeficsForVisionText2Textzmodel.embed_tokens.weightzlm_head.weightNc                    s>   t  | t|| _t|j|j|jd|jd| _	| 
  d S )NFr   )re   rf   rC  rY   r}   r   rK  rL  freeze_lm_headlm_headr`  )rj   r   rP  rl   r/   r0   rf   X  s   
	z"IdeficsForVisionText2Text.__init__c                 C   s   |   }|  }t| jddr&|j|_|jdkr&|j|jks J |jj|j_t	|drDt	|drF|j
|_t	|drHt	|drJ|j|_dS dS dS dS dS )	z
        Overwrite `transformers.modeling_utils.PreTrainedModel.tie_weights` to handle the case of
        IdeficsDecoupledLinear and IdeficsDecoupledEmbedding.
        tie_word_embeddingsTr   r   r`   r   rg   N)get_output_embeddingsget_input_embeddingsgetattrr   rh   rg   r   ri   r   r   r`   r   )rj   output_embeddingsinput_embeddingsr/   r/   r0   tie_weightsg  s   
z%IdeficsForVisionText2Text.tie_weightsFrC   r:   r   r#   rg  r5   r6   r7   r8   labelsrh  ri  r   rk   r_   c                 K   s   | j d|||||||||	||d|d|}|d }| |}d}|
dur3| jd||
| jjd|}t|||j|j|j|j	dS )aC  
        image_encoder_embeddings (`torch.FloatTensor`, *optional*):
            The output of the image encoder.
        perceiver_embeddings (`torch.FloatTensor`, *optional*):
            The output of the perceiver resampler.
        image_attention_mask (`torch.LongTensor`, *optional*):
            The attention mask for the image encoder.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoProcessor, IdeficsForVisionText2Text

        >>> model = IdeficsForVisionText2Text.from_pretrained("HuggingFaceM4/idefics-9b")
        >>> processor = AutoProcessor.from_pretrained("HuggingFaceM4/idefics-9b")

        >>> dogs_image_url_1 = "https://huggingface.co/datasets/hf-internal-testing/fixtures_nlvr2/raw/main/image1.jpeg"
        >>> dogs_image_url_2 = "https://huggingface.co/datasets/hf-internal-testing/fixtures_nlvr2/raw/main/image2.jpeg"

        >>> prompts = [
        ...     [
        ...         "User:",
        ...         dogs_image_url_1,
        ...         "Describe this image.\nAssistant: An image of two dogs.\n",
        ...         "User:",
        ...         dogs_image_url_2,
        ...         "Describe this image.\nAssistant:",
        ...     ]
        ... ]
        >>> inputs = processor(prompts, return_tensors="pt")
        >>> generate_ids = model.generate(**inputs, max_new_tokens=6)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True)
        ```T)rC   r:   r   r#   rg  r5   r6   r7   r8   rh  ri  return_dictr   r   N)r3   r  rK  )r2   r3   r#   r$   r%   r&   r/   )
rY   r  loss_functionr   rK  r1   r#   r$   r%   r&   )rj   rC   r:   r   r#   rg  r5   r6   r7   r8   r  rh  ri  r   rk   outputsr$   r3   r2   r/   r/   r0   rv   |  s<   9
z!IdeficsForVisionText2Text.forwardc              
      s   i }|d ur| j jr||d< n	||d< n||d< |dd|d< t j|f||||||
|	d||}|	d urP|d u rP|d jd }|	d d | d f |d	< |S )
Nr7   r6   r5   ri  F)r#   r:   rg  r   r   rh  r8   rC   r   r8   )r   rQ  popre   prepare_inputs_for_generationr<   )rj   rC   r:   r   rg  r#   r   r5   r&   r8   rh  rk   images_kwargsmodel_inputsrt  rl   r/   r0   r    s4   

	
z7IdeficsForVisionText2Text.prepare_inputs_for_generationr  rG   rE   c                    s~   t  j|||fi |}d|v r8|d }|d d dd d f d}|ddr-||d< ntj||gdd|d< |j|d< |S )Nr8   r4   r   rh  Tr   r&   )re   #_update_model_kwargs_for_generationr   rB   r+   r   r&   )rj   r  rG   rE   rk   r8   	last_maskrl   r/   r0   r    s   

z=IdeficsForVisionText2Text._update_model_kwargs_for_generationrQ   )NNNNNNNNNNNFN)	NNNNNNNNN)F)r'   r(   r)   _tied_weights_keysrf   r  r   r   r   r+   r  r   r
   r,   rz   r   r   r   r.   r1   rv   r  r   dictr{   r   r  r|   r/   r/   rl   r0   r  U  s    	

[1

r  )r  rC  r/  )r   FNN)r   )r   )Lr*   dataclassesr   typingr   r   r   r   r+   torch.nn.functionalr   r   rn   activationsr	   cache_utilsr
   r   
generationr   masking_utilsr   modeling_layersr   modeling_outputsr   modeling_utilsr   r   r   processing_utilsr   utilsr   r   r   r   utils.deprecationr   utils.genericr   r   configuration_ideficsr   	perceiverr   visionr   r   
get_loggerr'   r   r!   r1   rI   r\   rL   r]   rK   r}   Moduler   r   r   r   r   r   r   r   r   r  r  r/  rC  r  __all__r/   r/   r/   r0   <module>   s   

-i<'

 9 2 o J