o
    wiv                    @   s  d Z ddlmZ ddlmZmZmZmZ ddlZddl	m
  mZ ddlZddlm
Z
 ddlmZ ddlmZmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddlmZmZm Z  ddl!m"Z" ddl#m$Z$m%Z%m&Z&m'Z'm(Z( ddl)m*Z* ddl+m,Z, ddl-m.Z.m/Z/ e' rddl0m1Z1 ddl2m3Z3 e(4e5Z6ee%ddG dd deZ7ee%ddG dd deZ8				dJdd Z9g fd!d"Z:G d#d$ d$e
j;Z<G d%d& d&e
j=Z>G d'd( d(e
j?Z@G d)d* d*ej
j?ZAd+d, ZBdKd-d.ZCG d/d0 d0e
j?ZD	1dLd2e
j?d3ejEd4ejEd5ejEd6eejE d7eFd8eFfd9d:ZGG d;d< d<e
j?ZHG d=d> d>eZIG d?d@ d@eZJe%G dAdB dBe ZKG dCdD dDee$ZLe%G dEdF dFeKZMG dGdH dHeKeZNg dIZOdS )MzPyTorch Idefics model.    )	dataclass)AnyCallableOptionalUnionN)nn   )ACT2FN)CacheDynamicCache)GenerationMixin)AttentionMaskConverter)FlashAttentionKwargs)GradientCheckpointingLayer)ModelOutput)ALL_ATTENTION_FUNCTIONSPretrainedConfigPreTrainedModel)Unpack)
LossKwargsauto_docstringcan_return_tupleis_torch_flex_attn_availablelogging   )IdeficsConfig)IdeficsPerceiverResampler)IdeficsVisionEmbeddingsIdeficsVisionTransformer)	BlockMask)make_flex_block_causal_maskz{
    Base class for Idefics model's outputs that may also contain a past key/values (to speed up sequential decoding).
    )custom_introc                   @   s   e Zd ZU dZdZeej ed< dZ	ee
e
ej   ed< dZee
ej  ed< dZee
ej  ed< dZee
ej  ed< dS )IdeficsBaseModelOutputWithPasta  
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Sequence of hidden-states at the output of the last layer of the model.

        If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
        hidden_size)` is output.
    past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
        `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
        encoder_sequence_length, embed_size_per_head)`.

        Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
        `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
        input) to speed up sequential decoding.
    image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
        Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
        sequence_length, hidden_size)`.

        image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver
    Nlast_hidden_statepast_key_valueshidden_states
attentionsimage_hidden_states)__name__
__module____qualname____doc__r#   r   torchFloatTensor__annotations__r$   tupler%   r&   r'    r0   r0   i/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/transformers/models/idefics/modeling_idefics.pyr"   6   s   
 r"   zS
    Base class for Idefics causal language model (or autoregressive) outputs.
    c                   @   s   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eeej  ed< dZeeej  ed< dZeeej  ed< dZeeej  ed< dS )	IdeficsCausalLMOutputWithPasta  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
        Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
        sequence_length, hidden_size)`.

        image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver
    Nlosslogitsr$   r%   r&   r'   )r(   r)   r*   r+   r3   r   r,   r-   r.   r4   r$   listr%   r/   r&   r'   r0   r0   r0   r1   r2   Z   s   
 r2   Fc                 K   sJ  t | jd ddd|d| j}| d|} |dd |d< |dd |d< |dd |d< |dd |d< d|v rM|d }|d||d< |d urY|d||d	< |d d uri|d d||d< |d d ur}|d d||d< | |fS |d d ur|d d||d< | |fS |d d ur|d d||d< | |fS )
Nr   r   pixel_valuesimage_encoder_embeddingsperceiver_embeddingsimage_attention_masktoken_type_idsattention_mask)	r,   arangeshapeviewrepeattodeviceindex_selectget)	input_idsexpand_sizeis_encoder_decoderr<   encoder_outputsmodel_kwargsexpanded_return_idxr;   r0   r0   r1   expand_inputs_for_generation{   s:   ,		rK   c                    sf   t jt jt jd  fdd|D }|  D ]|r+tfdd|D r+d qd q| S )N)	LayerNormLinear	Embeddingc                    s   g | ]} | qS r0   r0   ).0m)mappingr0   r1   
<listcomp>       z freeze_model.<locals>.<listcomp>c                 3   s    | ]}t  |V  qd S N)
isinstance)rO   t)moduler0   r1   	<genexpr>   s    zfreeze_model.<locals>.<genexpr>TF)r   rL   rM   rN   modulesanyrequires_grad_)modelmodule_exceptionsmodule_exceptions_mappedr0   )rQ   rW   r1   freeze_model   s   r_   c                       sN   e Zd ZdZ				ddee ddf fddZdd	 Zdefd
dZ	  Z
S )IdeficsDecoupledEmbeddinga  
    Implements a decoupling of parameters to allow freezing (or not) a subset of the embeddings. In practise, the
    regular `weight` can be trained or frozen (i.e. `partially_freeze=True`), and if `num_additional_embeddings` > 0,
    then it will create `num_additional_embeddings` additional parameters that are always trained. If
    `num_additional_embeddings=0`, then the module defaults back to the regular behavior of `nn.Embedding`.
    FNpartially_freezereturnc           	         s   |dur||krt d| d| t jd|||||d| || _|| _|| _|| _|r5| jd | jdkrGt	j
| j|||d| _dS dS )	a)  
        Args:
            num_embeddings (`int`):
                Size of the dictionary of embeddings
            num_additional_embeddings (`int`):
                Number of additional embeddings. Only useful when you `partially_freeze=True`.
            embedding_dim (`int`):
                The size of each embedding vector
            partially_freeze: (`bool`, *optional*, defaults to `False`):
                If `True`, the regular `weight` will be frozen. `additional_weight` is never frozen.
            padding_idx (`int`, *optional*):
                The padding index (needs to be less than num_embeddings)

        Note: there are a lot of other parameters to initialize a standard `nn.Embedding` such as `padding_idx`,
        `max_norm` or `norm_type`. We are not supporting these.
        Nz/padding_idx must be within num_embeddings. Got z and )num_embeddingsembedding_dimrB   dtypepadding_idxFr   )rc   rd   rB   re   r0   )
ValueErrorsuper__init__rc   rf   num_additional_embeddingsra   weightr[   r   rN   additional_embedding)	selfrc   rj   rd   ra   rB   re   rf   kwargs	__class__r0   r1   ri      s2   
z"IdeficsDecoupledEmbedding.__init__c                 C   sj   | j dkrt|| jS | }t|| jk}|| }| || j }d||< t|| j}|||< |S )a  
        we have 2 embeddings, with different indices - one pretrained self.weight and another
        self.additional_embedding.weight that is being trained.

        in order to make a lookup of the input ids, we:
        1. find out the indices of the entries belonging to the 2nd embedding
        2. extract those values while subtracting the size of the first embedding (num_embeddings), since the 2nd
           embedding starts from 0 and not num_embeddings
        3. perform the 2nd embedding lookup
        4. now we handle the 1st embedding, we overwrite indices belonging to the 2nd embedding with a padding index
        5. perform the 1st embedding lookup
        6. now we overwrite the values in the 1st embedding lookup with the values of the 2nd embedding lookup

        note: for the 1st embedding lookup we could have looked up only the low indices and not do the padding, but
        then we have to create a new tensor and populate it with 2 tensors that are spread out across various indices -
        i.e. not a simple concat - I haven't benchmarked the complex case if it's any faster, given that seqlens are
        usually relatively short it's probably not faster or if faster not by much - but might be a good idea to
        measure.

        r   )	rj   F	embeddingrk   cloner,   whererc   rl   )rm   rE   additional_vocab_indicesinput_ids_additional_vocabadditional_embeddingsfull_vectorr0   r0   r1   forward   s   
z!IdeficsDecoupledEmbedding.forwardc                 C   s$   d| j  d| j d| j d| j S )Nznum_embeddings=z, num_additional_embeddings=z, embedding_dim=, partially_freeze=)rc   rj   rd   ra   rm   r0   r0   r1   
extra_repr  s   $z$IdeficsDecoupledEmbedding.extra_repr)FNNN)r(   r)   r*   r+   r   boolri   ry   strr|   __classcell__r0   r0   ro   r1   r`      s    
5'r`   c                       sj   e Zd ZdZ					ddedededed	ed
df fddZdejd
ejfddZ	d
e
fddZ  ZS )IdeficsDecoupledLineara  
    Implements a decoupling of parameters to allow freezing (or not) a subset of the parameters. In practise, the
    regular `weight` can be trained or frozen (i.e. `partially_freeze=True`), and if `out_additional_features` > 0,
    then it will create `out_additional_features * in_features` additional parameters that are always trained. If
    `out_additional_features=0`, then the module defaults back to the regular behavior of `nn.Linear`.
    r   TNin_featuresout_featuresout_additional_featuresbiasra   rb   c                    sr   t  ||||| || _|| _|| _|| _|r&| jd |r&| jd |dkr7t	j
|||||d| _dS dS )aG  
        out_additional_features: int. Number of additional trainable dimensions. Only makes sense when
        `partially_freeze=True`. partially_freeze: bool. If True, the regular `weight` will be frozen and extra
        parameters (if any) will be trainable. If False, default to the regular behavior of nn.Linear.
        Fr   )r   r   r   rB   re   N)rh   ri   r   ra   r   r   rk   r[   r   r   rM   additional_fc)rm   r   r   r   r   ra   rB   re   ro   r0   r1   ri   )  s$   zIdeficsDecoupledLinear.__init__inputc                 C   s:   t || j| j}| jdkr| |}t||fd}|S )Nr   r6   )rq   linearrk   r   r   r   r,   cat)rm   r   outputadditional_featuresr0   r0   r1   ry   M  s
   

zIdeficsDecoupledLinear.forwardc              
   C   s0   d| j  d| j d| j d| jdu d| j 
S )z=Overwriting `nn.Linear.extra_repr` to include new parameters.zin_features=z, out_features=z, out_additional_features=z, bias=Nrz   r   r   r   r   ra   r{   r0   r0   r1   r|   V  s   0z!IdeficsDecoupledLinear.extra_repr)r   TTNN)r(   r)   r*   r+   intr}   ri   r,   Tensorry   r~   r|   r   r0   r0   ro   r1   r      s,    	$	r   c                       s.   e Zd Zd fdd	Zdd Zdd Z  ZS )	IdeficsRMSNormư>c                    s&   t    tt|| _|| _dS )z=
        IdeficsRMSNorm is equivalent to T5LayerNorm
        N)rh   ri   r   	Parameterr,   onesrk   variance_epsilon)rm   hidden_sizeepsro   r0   r1   ri   ]  s   

zIdeficsRMSNorm.__init__c                 C   s\   | tjdjddd}|t|| j  }| jjtj	tj
fv r)| | jj}| j| S )N   r6   T)keepdim)rA   r,   float32powmeanrsqrtr   rk   re   float16bfloat16)rm   r%   variancer0   r0   r1   ry   e  s
   
zIdeficsRMSNorm.forwardc                 C   s   t | jj d| j S )Nz, eps=)r/   rk   r>   r   r{   r0   r0   r1   r|   o  s   zIdeficsRMSNorm.extra_repr)r   )r(   r)   r*   ri   ry   r|   r   r0   r0   ro   r1   r   \  s    
r   c                       s0   e Zd Zd
 fdd	Zdd Zddd	Z  ZS )IdeficsEmbedding   '  Nc                    sz   t    || _|| _|| _d| jtjd| jdtjdj|tj	d| j   }| j
d|dd | j|| jjt d	 d S )
N      ?r   r   re   rB   re   inv_freqF
persistentseq_lenrB   re   )rh   ri   dimmax_position_embeddingsbaser,   r=   int64rA   floatregister_buffer_set_cos_sin_cacher   rB   get_default_dtype)rm   r   r   r   rB   r   ro   r0   r1   ri   u  s   
&
zIdeficsEmbedding.__init__c                 C   s|   || _ tj| j |tjd| j}td|| j}tj||fdd}| jd|	 
|dd | jd| 
|dd d S )	Nr   zi,j->ijr6   r   
cos_cachedFr   
sin_cached)max_seq_len_cachedr,   r=   r   type_asr   einsumr   r   cosrA   sin)rm   r   rB   re   rV   freqsembr0   r0   r1   r     s   z#IdeficsEmbedding._set_cos_sin_cachec                 C   sN   || j kr| j||j|jd | jd | j|jd| jd | j|jdfS )Nr   r   )r   r   rB   re   r   rA   r   )rm   xr   r0   r0   r1   ry     s
   
zIdeficsEmbedding.forward)r   r   NrT   )r(   r)   r*   ri   r   ry   r   r0   r0   ro   r1   r   t  s    
r   c                 C   sH   | dd| j d d f }| d| j d d df }tj| |fddS )z*Rotates half the hidden dims of the input..Nr6   r   r   )r>   r,   r   )r   x1x2r0   r0   r1   rotate_half  s   r   c                 C   sL   ||  |}||  |}| | t| |  }|| t||  }||fS )an  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`):
            The position indices of the tokens corresponding to the query and key tensors. For example, this can be
            used to pass offsetted position ids when working with a KV-cache.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )	unsqueezer   )qkr   r   position_idsunsqueeze_dimq_embedk_embedr0   r0   r1   apply_rotary_pos_emb  s
   r   c                       s2   e Zd Zdededef fddZdd Z  ZS )
IdeficsMLPr   intermediate_size
hidden_actc                    sN   t    tj||dd| _tj||dd| _tj||dd| _t| | _d S )NFr   )	rh   ri   r   rM   	gate_proj	down_projup_projr	   act_fn)rm   r   r   r   ro   r0   r1   ri     s
   
zIdeficsMLP.__init__c                 C   s    |  | | || | S rT   )r   r   r   r   )rm   r   r0   r0   r1   ry         zIdeficsMLP.forward)r(   r)   r*   r   r~   ri   ry   r   r0   r0   ro   r1   r     s    r           rW   querykeyvaluer<   scalingdropoutc           
      K   s|   t ||dd| }|d ur|| }tjj|dt jd|j}tjj	||| j
d}t ||}	|	dd }	|	|fS )Nr6   )r   re   ptrainingr   r   )r,   matmul	transposer   
functionalsoftmaxr   rA   re   r   r   
contiguous)
rW   r   r   r   r<   r   r   rn   attn_weightsattn_outputr0   r0   r1   eager_attention_forward  s   
r   c                       s   e Zd ZdZ					ddedededed	ed
edee f fddZ	de
jdedefddZ							dde
jdee
j dee
j dee
j deee
j  dededee
j dee
jee
j eee
j  f fddZ  ZS ) IdeficsAttentionz=Multi-headed attention from 'Attention Is All You Need' paperr   FNr   	num_headsr   is_cross_attentionconfigqk_layer_norms	layer_idxc           	         s  t    || _|| _|| _|| | _|| _d| _| jd | _|| _	|d u r1t
d| jj d | j| | jkrEtd| j d| d|| _ttjdsRtd	| jrt|jd
s^| jn|jj}tj| j|| j dd| _tj||| j dd| _tj||| j dd| _n'tj| j|| j dd| _tj| j|| j dd| _tj| j|| j dd| _tj|| j |dd| _t| j| _|| _| jrt| j|jd| _t| j|jd| _ d S d S )NTg      zInstantiating z without passing a `layer_idx` is not recommended and will lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` when creating this class.z?hidden_size must be divisible by num_heads (got `hidden_size`: z and `num_heads`: z).scaled_dot_product_attentionz)this model requires pytorch 2.0 or higher	embed_dimFr   r   )!rh   ri   r   r   r   head_dimr   	is_causalr   r   loggerwarning_oncerp   r(   rg   r   hasattrr   r   vision_configr   rM   q_projk_projv_projo_projr   
rotary_embr   r   rms_norm_epsq_layer_normk_layer_norm)	rm   r   r   r   r   r   r   r   kv_input_dimro   r0   r1   ri     s|   




zIdeficsAttention.__init__tensorr   bszc                 C   s    | ||| j| jdd S )Nr   r   )r?   r   r   r   r   )rm   r   r   r   r0   r0   r1   _shape=  r   zIdeficsAttention._shaper%   key_value_statesr<   r   past_key_valueoutput_attentions	use_cachecache_positionrb   c	                 K   s  | j p|d u}
| \}}}| |||| j| jdd}|
sD| |||| j| jdd}| |||| j| jdd}n)| \}}}| |||| j| jdd}| |||| j| jdd}|j	d }|d ur|||d 7 }|
s| j
|t||d\}}t|||||\}}|d urd|i}|||| j|\}}| jr| |}| |}t}| jjdkr| jjdkr|rtd	 nt| jj }|| ||||f| jsd
n| j| jd|	\}}|||d }| |}|rd }|||fS )Nr   r   r   r   )r   r  eagersdpaz`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.r   )r   r   r6   )r   sizer   r?   r   r   r   r   r   r>   r   maxr   updater   r   r   r   r   r   _attn_implementationr   r   r   r   r   r   reshaper   r   )rm   r%   r   r<   r   r   r   r   r  rn   r   r   q_len_query_states
key_statesvalue_stateskv_len
kv_seq_lenr   r   cache_kwargsattention_interfacer   r   r0   r0   r1   ry   @  s\   ""$" 





zIdeficsAttention.forward)r   FNFNNNNNFFN)r(   r)   r*   r+   r   r   r}   r   r   ri   r,   r   r   
LongTensorr/   ry   r   r0   r0   ro   r1   r     sb    Q	r   c                       s   e Zd Zddedee f fddZ						ddejdeej d	eej	 d
ee
ej  dee dee deej	 de
ejee
ejejf  f fddZ  ZS )IdeficsDecoderLayerNr   r   c                    sr   t    |j| _t| j|j|j||d| _t| j|j|j	d| _
t|j|jd| _t|j|jd| _|j| _d S )N)r   r   r   r   r   r   r   r   r   )rh   ri   r   r   num_attention_headsr   	self_attnr   r   r   mlpr   r   input_layernormpost_attention_layernormrm   r   r   ro   r0   r1   ri     s"   
zIdeficsDecoderLayer.__init__Fr%   r<   r   r   r   r   r  rb   c              
   K   s   |}	|  |}| jd|||||||d|\}}
}tjj|| j| jd}|	| }|}	| |}| |}tjj|| j| jd}|	| }|f}|rN||
f7 }|rU||f7 }|S )a]  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
        )r%   r<   r   r   r   r   r  r   Nr0   )r  r  r   r   r   r   r  r  )rm   r%   r<   r   r   r   r   r  rn   residualself_attn_weightspresent_key_valueoutputsr0   r0   r1   ry     s4   





zIdeficsDecoderLayer.forwardrT   )NNNFFN)r(   r)   r*   r   r   r   ri   r,   r   r  r/   r}   r-   ry   r   r0   r0   ro   r1   r    s2    
r  c                       s   e Zd Zddedee f fddZ							ddejdeej d	eej d
eej deej dee	 dee	 dee
ej  de
ejee
ejejf  f fddZ  ZS )IdeficsGatedCrossAttentionLayerNr   r   c              	      s~  t    |j| _t| j|jd|j||j|d| _t| j|j	|j
d| _t|j|jd| _t|j|jd| _|j| _t | _t | _|jdkr|jdkrgttdd| j| _ttdd| j| _n|jdkrttd| _ttd| _ntd	|j d
|jdkr|jdkrttdd| j| _ttdd| j| _n|jdkrttd| _ttd| _njtd	|j d
|jdv r$|jdkrttjd|jdd| jfd| _ttjd|jdd| jfd| _n3|jdkrttjd|jdd| _ttjd|jdd| _ntd	|j d
t d|j dt!| dr9t!| ds=tdd S )NT)r   r   r   r   r   r   r   r  r   zerosvectorr   r   z Unknown value for `alpha_type` ()r   >   normalrandomgaussianr   )r   stdr  zAlpha initialization scheme z not yet implemented!alpha_cross_attnalpha_densez+Alpha parameters not initialized correctly!)"rh   ri   r   r   r  r   r   
cross_attnr   r   r   r  r   r   r  r  r   r   Tanhact_cross_attn	act_densealpha_initializer
alpha_typer   r,   r!  r(  r)  rg   r   r$  alphas_initializer_rangeNotImplementedErrorr   r  ro   r0   r1   ri     sn   
	








z(IdeficsGatedCrossAttentionLayer.__init__Fr%   r<   r'   r:   cross_attention_gater   r   r   rb   c	                 K   s  |du rt d|du rt d|durtd|}
| |}| jd	||||d|	\}}}tjj|| j| jd}|	|dkdddddf d}|
| 
| j|  }|}
| |}| |}tjj|| j| jd}|
| | j|  }|f}|r||f7 }|r||f7 }|S )
a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            image_attention_mask (`torch.FloatTensor`, *optional*): image attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            cross_attention_gate (`torch.FloatTensor`, *optional*):
                gate of size `(batch, seq_len)` used to zero-out cross-attention output for tokens attending no images.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
        Nzt`image_hidden_states` is required for Idefics cross attention module which are visual features to be conditioned on.z`cross_attention_gate` is required for Idefics cross attention module to zero-out the cross-attention hidden_states attending to no images.zMPast key value states are not implemented for Idefics cross attention module.)r%   r   r<   r   r   r   r   r0   )rg   r1  r  r*  r   r   r   r   r   masked_fillr,  r(  r  r  r-  r)  )rm   r%   r<   r'   r:   r2  r   r   r   rn   r  r  r  r  r0   r0   r1   ry   "  sD   
"



z'IdeficsGatedCrossAttentionLayer.forwardrT   r  )r(   r)   r*   r   r   r   ri   r,   r   r}   r/   r-   ry   r   r0   r0   ro   r1   r     s8    E	r   c                   @   s<   e Zd ZeZdZdZddgZdZdZ	dZ
dZdZdd ZdS )	IdeficsPreTrainedModelr\   Tr  r   Fc                 C   s  | j j}t|tjtjfr%|jjjd|d |j	d ur#|j	j
  d S d S t|tjrF|jjjd|d |jd urD|jj|j 
  d S d S t|tjr[|jjd |j	j
  d S t|tri|jjd d S t|trv|jj  d S t|tr| j jdkr|jj
  |jj
  d S | j jdkr|jjd |jjd d S | j jdv r|jjjd| j jd |jjjd| j jd d S d S t|tr|jj  d S d S )Nr   )r   r'  r   r!  r   >   r$  r%  r&  )r   initializer_rangerU   r   rM   Conv2drk   datanormal_r   zero_rN   rf   rL   fill_r   r   class_embeddingr   r.  r(  r)  r0  r   latents)rm   rW   r'  r0   r0   r1   _init_weights|  s@   





z$IdeficsPreTrainedModel._init_weightsN)r(   r)   r*   r   config_classbase_model_prefixsupports_gradient_checkpointing_no_split_modules_supports_sdpa_supports_cache_class_supports_flash_attn_2_supports_static_cache_supports_attention_backendr=  r0   r0   r0   r1   r4  p  s    r4  c                   @   s   e Zd ZdS )KwargsForCausalLMN)r(   r)   r*   r0   r0   r0   r1   rG    s    rG  c                '       s  e Zd ZdZdef fddZd.ddZg fdd	Zg fd
dZdd Z	dd Z
ee															d/deej deej deej deeej  deej deej deej deej deej dee dee dee dee dee deej d ee d!eeef f"d"d#Z	d0deejd$f d%ejdejdedef
d&d'Zedejd(ed)ed*ejdejd+efd,d-Z  Z S )1IdeficsModelz
    Transformer decoder consisting of `config.num_hidden_layers` layers. Each layer is a [`IdeficsDecoderLayer`]

    Args:
        config: IdeficsConfig
    r   c                    s
  t     | _ j| _ j| _t j j j j	| jd| _
 jj| _ j| _t j| _ jrE j}t  jj|j|j|j|j| _t fddt jD | _ j| _ j| j }t fddt|D | _d| _t  j j!d| _"| #  | $  d S )N)rc   rj   rd   ra   rf   c                       g | ]}t  |d qS )r   )r  rO   ir   r0   r1   rR         z)IdeficsModel.__init__.<locals>.<listcomp>c                    rI  rJ  )r   rK  rM  r0   r1   rR     rN  Fr   )%rh   ri   r   pad_token_idrf   
vocab_sizer`   additional_vocab_sizer   freeze_text_layersembed_tokensr   
image_sizer   vision_modeluse_resamplerperceiver_configr   r   resampler_depthresampler_n_headsresampler_head_dimresampler_n_latentsperceiver_resamplerr   
ModuleListrangenum_hidden_layerslayerscross_layer_intervalgated_cross_attn_layersgradient_checkpointingr   r   norm	post_initfreeze_relevant_params)rm   r   rW  num_cross_layersro   rM  r1   ri     sH   
	zIdeficsModel.__init__Nc                 C   s>   |d u r| j }|jr| |j |jrt| j|jd d S d S N)r]   )r   rR  freeze_text_module_exceptionsfreeze_vision_layersr_   rU  freeze_vision_module_exceptions)rm   r   r0   r0   r1   rf    s   z#IdeficsModel.freeze_relevant_paramsc                 C   s"   | j | jfD ]}t||d qd S rh  )r`  rd  r_   )rm   r]   rW   r0   r0   r1   rR    s   zIdeficsModel.freeze_text_layersc                 C   s   t | j|d d S rh  )r_   rU  )rm   r]   r0   r0   r1   rj    s   z!IdeficsModel.freeze_vision_layersc                 C      | j S rT   rS  r{   r0   r0   r1   get_input_embeddings     z!IdeficsModel.get_input_embeddingsc                 C   
   || _ d S rT   rm  rm   r   r0   r0   r1   set_input_embeddings     
z!IdeficsModel.set_input_embeddingsFrE   r<   r   r$   inputs_embedsr7   r8   r9   r:   r   r   output_hidden_statesinterpolate_pos_encodingreturn_dictr  rn   rb   c           +   
   K   s  |dur|j n|j }|dur|n| jj}|dur|n| jj}|
dur$|
n| jj}
|dur.|n| jj}|du |duA r>td| jrM| jrM|
rMt	
d d}
|du rV| |}d}|
rst|tssd}|du rit }n
t|}t	
d |j\}}}|dur| nd}|| }|du rtj|||jd  |j d	}|dur|du r| d
d }||dkd |dd| df }n	|du r|d}tdd |||fD dkrtd|dur
|j| j|d}|jdd \}}| j|| g|jdd R  }| j||dj}n|dur(| \}}}}|j| j|d}||| ||}| jj rN|du rC| !|}|d|d}}n| \}}}}|}n|du r_|d|d}}ntd|||| |}|	d}|	d
}	|	"ddd|}	|	|||| }	|dur| \}}}||f}|	du rtj#||d	}	| $|	}	nd}	|	dkj%d
dj| jdj&dd|} |du rtj#||ftj'|j d}| (|||||}|}!|rdnd}"|rdnd}#d}$t)| j*D ]Z\}%}&|r|"|!f7 }"|%| j+ dkr | j,|%| j+  }'|'|!||f|	| ||
dd|}(|(d }!|&|!f|||||
|d|})|)d }!|
r?|)|r<dnd }$|rI|#|)d f7 }#q| -|!}!|rX|"|!f7 }"|
r]|$nd}*|rf|*. }*|||||}t/|!|*|"|#|dS )ab  
        image_encoder_embeddings (`torch.FloatTensor`, *optional*):
            The output of the image encoder.
        perceiver_embeddings (`torch.FloatTensor`, *optional*):
            The output of the perceiver resampler.
        image_attention_mask (`torch.LongTensor`, *optional*):
            The attention mask for the image encoder.
        Nz:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.FTzWe detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class (https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)r   r   rB   r6   c                 S   s   g | ]}|d u qS rT   r0   )rO   r   r0   r0   r1   rR   C  rS   z(IdeficsModel.forward.<locals>.<listcomp>r   z_Exactly 1 of pixel_values, image_encoder_embeddings or perceiver_embeddings has to be not-None.)re   rB   )r7   rv  zBIf `perceiver_embeddings` are passed, use_resampler should be Truer   r   r   r0   )r:   r2  r   r   r   )r<   r   r   r   r   r  )r#   r$   r%   r&   r'   )0rB   r   r   ru  r   use_return_dictrg   rc  r   r   r   rS  rU   r
   r   from_legacy_cacher>   get_seq_lengthr,   r=   longcumsummasked_fill_r   sumrA   re   r   r?   rU  r#   r  rV  r\  r@   r   invert_attention_maskrZ   squeezer}   _update_causal_mask	enumerater`  ra  rb  rd  to_legacy_cacher"   )+rm   rE   r<   r   r$   rt  r7   r8   r9   r:   r   r   ru  rv  rw  r  rn   rB   return_legacy_cache
batch_size
seq_lengthr
  past_key_values_lengthseq_length_with_past
num_imagesr'   image_seq_lenimage_hidden_sizetext_seq_lenimage_batch_sizeimage_sequence_lengthimage_hidden_shaper2  r%   all_hidden_statesall_self_attnsnext_decoder_cacheidxdecoder_layercross_attn_blockr  layer_outputs
next_cacher0   r0   r1   ry     s  



$








"


	


zIdeficsModel.forwardr   input_tensorc                 C   s:  | j jdkr|d ur|dk r|S d S | j jdkr&t|tjr$t|}|S |d ur.| nd}|d ur7|jnd}| j jdkrO|sO|sOt	j
|||| jdrOd S |j}|jd }	|r^| }
nt|tjri|jd	 n||	 d }
| j||	|
|||jd d
}| j jdkr|d ur|jjdv r|st|j}t	||}|S )Nflash_attention_2r   flex_attentionr   Fr  )rt  r  is_trainingr   r6   )sequence_lengthtarget_lengthre   r  r  )cudaxpunpu)r   r  rZ   rU   r,   r   r    r{  is_compileabler   _ignore_causal_mask_sdpar   re   r>   get_max_cache_shape5_prepare_4d_causal_attention_mask_with_cache_positionrB   typefinfomin_unmask_unattended)rm   r<   r  r  r$   r   past_seen_tokensusing_compilable_cachere   r  r  causal_mask	min_dtyper0   r0   r1   r    sT   




z IdeficsModel._update_causal_maskr  r  re   r  c                 K   sD  | dur|   dkr| }|S t|j}tj||f|||jd}|dkr+tj|dd}|tj||jd|ddk9 }|ddddddf 	|ddd}| dur|
 }| jd }	|ddddddd|	f | ddddddf |j }
|
dk}
|ddddddd|	f |
||ddddddd|	f< |S )	aM  
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        N   )
fill_valuere   rB   r   )diagonalrx  r6   r   )r   r,   r  r  fullrB   triur=   r  expandrs   r>   rA   r3  )r<   r  r  re   r  r  rn   r  r  mask_lengthpadding_maskr0   r0   r1   r    s,    $
6  zBIdeficsModel._prepare_4d_causal_attention_mask_with_cache_positionrT   )NNNNNNNNNNNNFNNF)!r(   r)   r*   r+   r   ri   rf  rR  rj  rn  rr  r   r   r   r,   r  r   r5   r-   r}   r   r   r   r/   r"   ry   r
   r  staticmethodr   re   r  r   r0   r0   ro   r1   rH    s    
0
	

 ]
DrH  c                )       s  e Zd ZddgZd2 fdd	Zdd Zdd	 Zd
d Zdd Zdd Z	dd Z
dd Zee																d3deej deej deej deeej  deej deej deej deej deej deej dee d ee d!ee d"ee d#ee d$eej d%ee d&eeef f$d'd(Z									d4 fd)d*	Z	d5d+ed,eeef d-ed&eeef f fd.d/Z e!d0d1 Z"  Z#S )6IdeficsForVisionText2Textzmodel.embed_tokens.weightzlm_head.weightNc                    s>   t  | t|| _t|j|j|jd|jd| _	| 
  d S )NFr   )rh   ri   rH  r\   r   r   rP  rQ  freeze_lm_headlm_headre  )rm   r   rU  ro   r0   r1   ri   I  s   
	z"IdeficsForVisionText2Text.__init__c                 C   s   | j jS rT   r\   rS  r{   r0   r0   r1   rn  X  s   z.IdeficsForVisionText2Text.get_input_embeddingsc                 C   s   || j _d S rT   r  rq  r0   r0   r1   rr  [  s   z.IdeficsForVisionText2Text.set_input_embeddingsc                 C   rl  rT   r  r{   r0   r0   r1   get_output_embeddings^  ro  z/IdeficsForVisionText2Text.get_output_embeddingsc                 C   rp  rT   r  )rm   new_embeddingsr0   r0   r1   set_output_embeddingsa  rs  z/IdeficsForVisionText2Text.set_output_embeddingsc                 C   rp  rT   r\   )rm   decoderr0   r0   r1   set_decoderd  rs  z%IdeficsForVisionText2Text.set_decoderc                 C   rl  rT   r  r{   r0   r0   r1   get_decoderg  ro  z%IdeficsForVisionText2Text.get_decoderc                 C   s   |   }|  }t| jddr&|j|_|jdkr&|j|jks J |jj|j_t	|drDt	|drF|j
|_t	|drHt	|drJ|j|_dS dS dS dS dS )	z
        Overwrite `transformers.modeling_utils.PreTrainedModel.tie_weights` to handle the case of
        IdeficsDecoupledLinear and IdeficsDecoupledEmbedding.
        tie_word_embeddingsTr   r   rc   r   rj   N)r  rn  getattrr   rk   rj   r   rl   r   r   rc   r   )rm   output_embeddingsinput_embeddingsr0   r0   r1   tie_weightsj  s   
z%IdeficsForVisionText2Text.tie_weightsFrE   r<   r   r$   rt  r7   r8   r9   r:   labelsr   r   ru  rv  rw  r  rn   rb   c                 K   s   |dur|n| j j}|dur|n| j j}|dur|n| j j}| jd|||||||||	||||d|d|}|d }| |}d}|
durS| jd||
| j jd|}t|||j	|j
|j|jdS )aC  
        image_encoder_embeddings (`torch.FloatTensor`, *optional*):
            The output of the image encoder.
        perceiver_embeddings (`torch.FloatTensor`, *optional*):
            The output of the perceiver resampler.
        image_attention_mask (`torch.LongTensor`, *optional*):
            The attention mask for the image encoder.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoProcessor, IdeficsForVisionText2Text

        >>> model = IdeficsForVisionText2Text.from_pretrained("HuggingFaceM4/idefics-9b")
        >>> processor = AutoProcessor.from_pretrained("HuggingFaceM4/idefics-9b")

        >>> dogs_image_url_1 = "https://huggingface.co/datasets/hf-internal-testing/fixtures_nlvr2/raw/main/image1.jpeg"
        >>> dogs_image_url_2 = "https://huggingface.co/datasets/hf-internal-testing/fixtures_nlvr2/raw/main/image2.jpeg"

        >>> prompts = [
        ...     [
        ...         "User:",
        ...         dogs_image_url_1,
        ...         "Describe this image.\nAssistant: An image of two dogs.\n",
        ...         "User:",
        ...         dogs_image_url_2,
        ...         "Describe this image.\nAssistant:",
        ...     ]
        ... ]
        >>> inputs = processor(prompts, return_tensors="pt")
        >>> generate_ids = model.generate(**inputs, max_new_tokens=6)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True)
        ```NT)rE   r<   r   r$   rt  r7   r8   r9   r:   r   r   ru  rv  rw  r  r   )r4   r  rP  )r3   r4   r$   r%   r&   r'   r0   )r   r   ru  ry  r\   r  loss_functionrP  r2   r$   r%   r&   r'   )rm   rE   r<   r   r$   rt  r7   r8   r9   r:   r  r   r   ru  rv  rw  r  rn   r  r%   r4   r3   r0   r0   r1   ry     sH   <
z!IdeficsForVisionText2Text.forwardc              
      s   i }|d ur| j jr||d< n	||d< n||d< |dd|d< t j|f||||||
|	d||}|	d urP|d u rP|d jd }|	d d | d f |d	< |S )
Nr9   r8   r7   rv  F)r$   r<   rt  r  r   r   r:   rE   r   r:   )r   rV  poprh   prepare_inputs_for_generationr>   )rm   rE   r<   r   rt  r$   r  r7   r'   r:   r   rn   images_kwargsmodel_inputsr  ro   r0   r1   r    s4   

	
z7IdeficsForVisionText2Text.prepare_inputs_for_generationr  rI   rG   c                    s~   t  j|||fi |}d|v r8|d }|d d dd d f d}|ddr-||d< ntj||gdd|d< |j|d< |S )Nr:   r6   r   r   Tr   r'   )rh   #_update_model_kwargs_for_generationr   rD   r,   r   r'   )rm   r  rI   rG   rn   r:   	last_maskro   r0   r1   r    s   

z=IdeficsForVisionText2Text._update_model_kwargs_for_generationc                    s.   d}| D ]}|t  fdd|D f7 }q|S )Nr0   c                 3   s    | ]	}| d  V  qdS )r   N)rC   )rO   
past_statebeam_idxr0   r1   rX   0  s    z;IdeficsForVisionText2Text._reorder_cache.<locals>.<genexpr>)r/   )pastr  reordered_past
layer_pastr0   r  r1   _reorder_cache,  s   z(IdeficsForVisionText2Text._reorder_cacherT   )NNNNNNNNNNNNNFNN)	NNNNNNNNNr  )$r(   r)   r*   _tied_weights_keysri   rn  rr  r  r  r  r  r  r   r   r   r,   r  r   r5   r-   r}   r   rG  r   r/   r2   ry   r  r   dictr~   r   r  r  r  r   r0   r0   ro   r1   r  F  s    	

g1

r  )r  rH  r4  )r   FNN)r   )r   )Pr+   dataclassesr   typingr   r   r   r   r,   torch.nn.functionalr   r   rq   torch.utils.checkpointactivationsr	   cache_utilsr
   r   
generationr   modeling_attn_mask_utilsr   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   modeling_utilsr   r   r   processing_utilsr   utilsr   r   r   r   r   configuration_ideficsr   	perceiverr   visionr   r   !torch.nn.attention.flex_attentionr   integrations.flex_attentionr    
get_loggerr(   r   r"   r2   rK   r_   rN   r`   rM   r   Moduler   r   r   r   r   r   r   r   r   r  r   r4  rG  rH  r  __all__r0   r0   r0   r1   <module>   s   

-i<'

 &Q -   ' o