o
    ei]                    @   s6  d Z ddlZddlZddlmZ ddlmZ ddlmZ ddlZddlm	Z	 ddl
mZ dd	lmZ dd
lmZ ddlmZmZ ddlmZ ddlmZ ddlmZmZmZmZmZmZ ddlm Z m!Z!m"Z"m#Z#m$Z$ e%e&Z'dZ(dZ)dZ*e$e"B e#B Z+eeddG dd deZ,eeddG dd deZ-eeddG dd deZ.G dd de	j/Z0G dd  d e	j/Z1G d!d" d"e	j/Z2G d#d$ d$e	j/Z3G d%d& d&e	j/Z4G d'd( d(e	j/Z5G d)d* d*e	j/Z6G d+d, d,e	j/Z7G d-d. d.eZ8G d/d0 d0e	j/Z9G d1d2 d2e	j/Z:eG d3d4 d4eZ;eG d5d6 d6e;Z<eG d7d8 d8e;Z=eG d9d: d:e;Z>eG d;d< d<e;Z?G d=d> d>e	j/Z@G d?d@ d@e	j/ZAG dAdB dBe	j/ZBedCdG dDdE dEe;ZCG dFdG dGe	j/ZDG dHdI dIe	j/ZEG dJdK dKe	j/ZFG dLdM dMe	j/ZGedNdG dOdP dPe;ZHg dQZIdS )RzPyTorch FLAVA model.    N)OrderedDict)	dataclass)Any)nn   )initialization)ACT2FN)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPooling)PreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringcan_return_tuplelogging	torch_int   )FlavaConfigFlavaImageCodebookConfigFlavaImageConfigFlavaMultimodalConfigFlavaTextConfigzfacebook/flava-image-codebookg$(~k@a  
    Output from FlavaModel containing embeddings and outputs from individual encoders.

    Note that `image_embeddings` and `text_embeddigns` returned are similar to pooled output returned from a
    transformer. If you want embeddings for contrastive loss or retrieval use a FLAVA model's `image_projection` and
    `text_projection` layers on `image_embeddings` and `text_embeddings` respectively.
    )custom_introc                   @   s   e Zd ZU dZdZejdB ed< dZe	dB ed< dZ
ejdB ed< dZe	dB ed< dZejdB ed< dZe	dB ed< d	ee fd
dZdS )FlavaModelOutputa  
    image_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `pixel_values` are present):
        The image embeddings which are basically the pooled output of [`FlavaImageModel`].
    image_output (`BaseModelOutputWithPooling`, *optional*, returned when `pixel_values` are present):
        The output of the [`FlavaImageModel`].
    text_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `input_ids` are present):
        The text embeddings which are basically the pooled output of [`FlavaTextModel`].
    text_output (`BaseModelOutputWithPooling`, *optional*, returned when `input_ids` are present):
        The output of the [`FlavaTextModel`].
    multimodal_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `input_ids` and `pixel_values` are present and `skip_multimodal_encoder` is `None` or `False`):
        The multimodal embeddings which are basically the pooled output of [`FlavaTextModel`].
    multimodal_output (`BaseModelOutputWithPooling`, returned when `input_ids` and `pixel_values` are present and `skip_multimodal_encoder` is `None` or `False`):
        The output of the [`FlavaMultimodalModel`].
    Nimage_embeddingsimage_outputtext_embeddingstext_outputmultimodal_embeddingsmultimodal_outputreturnc                    s   t  fdd  D S )Nc                 3   s.    | ]}|d vr | nt  | V  qdS ))r   r   r!   Ngetattrto_tuple.0kself f/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/flava/modeling_flava.py	<genexpr>U   s
    
z,FlavaModelOutput.to_tuple.<locals>.<genexpr>tuplekeysr)   r+   r)   r,   r%   T   s   zFlavaModelOutput.to_tuple)__name__
__module____qualname____doc__r   torchFloatTensor__annotations__r   r   r   r   r    r!   r/   r   r%   r+   r+   r+   r,   r   3   s   
 r   z@
    Class representing pretraining losses from FLAVA model
    c                   @   s   e Zd ZU dZdZejdB ed< dZejdB ed< dZ	ejdB ed< dZ
ejdB ed< dZejdB ed< dZejdB ed< d	efd
dZdS )FlavaLossesa  
    mim (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `mim_labels` and `pixel_values` are present, `input_ids_masked` is absent and `mim_weight` > 0.):
        Masked Image Modeling loss as used in BeIT calculated only for unimodal image data.
    mlm (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `mlm_labels` and `input_ids_masked` are present, `pixel_values` is absent and `mlm_weight` > 0.):
        Masked Language Modeling loss as used in BERT calculated only for unimodal text data.
    itm (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `itm_labels`, `input_ids_masked`, `pixel_values` are present and `itm_weight` > 0.):
        Image Text Matching (ITM) loss calculated for paired image-text data. Note that ITM loss is calculated on
        masked pairs in FLAVA.
    global_contrastive (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `input_ids` and `pixel_values` are present and `global_contrastive_weight` > 0.):
        Contrastive loss for image-text similarity similar to CLIP but calculated globally for paired image-text
        data. This is calculated on unmasked images and texts.
    mmm_image (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `mim_labels`, `pixel_values` and `input_ids_masked` are present and `mmm_image_weight` > 0.):
        Masked Multimodal Modeling loss's image component calculated on paired image-text data.
    mmm_text (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `mlm_labels`, `pixel_values` and `input_ids_masked` are present and `mmm_text_weight` > 0.):
        Masked Multimodal Modeling loss's text component calculated on paired image-text data.
    Nmimmlmitmglobal_contrastive	mmm_imagemmm_textr"   c                 C   s(   d}|   D ]}|d urd} |S q|S )NTF)values)r*   all_nonevr+   r+   r,   r@   z   s   zFlavaLosses.all_none)r1   r2   r3   r4   r9   r5   r6   r7   r:   r;   r<   r=   r>   boolr@   r+   r+   r+   r,   r8   [   s   
 r8   a  
    Output from FlavaForPreTraining containing embeddings, and outputs from individual encoders.

    Note that `image_embeddings` and `text_embeddings` returned are similar to pooled output returned from a
    transformer. If you want embeddings for contrastive loss or retrieval use a FLAVA model's `image_projection` and
    `text_projection` layers on `image_embeddings` and `text_embeddings` respectively.
    c                   @   s  e Zd ZU dZdZejdB ed< dZe	ed< dZ
ejdB ed< dZedB ed< dZejdB ed< dZedB ed< dZejdB ed	< dZedB ed
< dZejdB ed< dZedB ed< dZejdB ed< dZedB ed< dZejdB ed< dZedB ed< dZejdB ed< dZejdB ed< dZejdB ed< dZejdB ed< dZejdB ed< dZejdB ed< dZejdB ed< dee fddZ dS )FlavaForPreTrainingOutputay  
    loss (`torch.FloatTensor`, *optional*, returned when `return_loss` is True):
        Total loss calculated for this model.
    loss_info (`FlavaLosses`):
        Detailed info for FLAVA Pretraining losses. Check `FlavaLosses` class description for the information on
        the keys.
    image_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `pixel_values` are present):
        The image embeddings which are basically the pooled output of [`FlavaImageModel`].
    image_output (`BaseModelOutputWithPooling`, *optional*, returned when `pixel_values` are present):
        The output of the [`FlavaImageModel`].
    text_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `input_ids` are present):
        The text embeddings which are basically the pooled output of [`FlavaTextModel`].
    text_output (`BaseModelOutputWithPooling`, *optional*, returned when `input_ids` are present):
        The output of the [`FlavaTextModel`].
    multimodal_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `input_ids` and `pixel_values` are present and `skip_unmasked_multimodal_encoder` is `None` or `False`):
        The multimodal embeddings which are basically the pooled output of [`FlavaTextModel`].
    multimodal_output (`BaseModelOutputWithPooling`, returned when `input_ids` and `pixel_values` are present and `skip_unmasked_multimodal_encoder` is `None` or `False`):
        The output of the [`FlavaMultimodalModel`].
    image_masked_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `pixel_values` are present):
        The image embeddings which are basically the pooled output of [`FlavaImageModel`]. Uses `bool_masked_pos`
        to create masked images.
    image_masked_output (`BaseModelOutputWithPooling`, *optional*, returned when `pixel_values` are present):
        The output of the [`FlavaImageModel`]. Uses `bool_masked_pos` to create masked images.
    text_masked_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `input_ids_masked` are present):
        The text embeddings which are basically the pooled output of [`FlavaTextModel`].
    text_masked_output (`BaseModelOutputWithPooling`, *optional*, returned when `input_ids_masked` are present):
        The output of the [`FlavaTextModel`].
    multimodal_masked_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `input_ids` and `pixel_values` are present):
        The multimodal embeddings which are basically the pooled output of [`FlavaTextModel`].
    multimodal_masked_output (`BaseModelOutputWithPooling`, *optional*, returned when `input_ids_masked` and `pixel_values` are present):
        The output of the [`FlavaMultimodalModel`].
    mim_logits (`torch.FloatTensor` of shape `(batch_size, num_image_patches, image_vocab_size)` or of shape `(total_masked_patches, image_vocab_size)` , *optional*, returned when `pixel_values` are present and `input_ids_masked` are not):
        The logits for MIM unimodal loss. Uses `book_masked_pos` to get masked patches. The flattened output is
            returned when `bool_masked_pos` has some of the patches masked.
    mlm_logits (`torch.FloatTensor` of shape `(batch_size, text_seq_length, text_vocab_size)` or of shape `(total_masked_seq_length, text_vocab_size)`, *optional*, returned when `input_ids_masked` are present and `pixel_values` are not):
        The logits for MLM unimodal loss. The flattened output is returned when `input_ids_masked` has some of
            the tokens masked.
    itm_logits (`torch.FloatTensor` of shape `(batch_size, 2)`, *optional*, returned when `input_ids_masked` and `pixel_values` are present):
        The logits for ITM loss. Note that ITM loss is calculated on masked pairs in FLAVA.
    contrastive_logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
        The scaled dot product scores between `image_embeddings` and `text_embeddings` but passed through FLAVA's
        `image_projection` and `text_projection` layers respectively. This represents the image-text similarity
        scores. This is calculated on unmasked images and texts.
    contrastive_logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
        The scaled dot product scores between `text_embeddings` and `image_embeddings` but passed through FLAVA's
        `text_projection` and `image_projection` layers respectively. This is calculated on unmasked images and
        texts.
    mmm_image_logits (`torch.FloatTensor` of shape `(batch_size, num_image_patches, image_vocab_size)` or of shape`(total_masked_patches, image_vocab_size)`, *optional*, returned when `pixel_values` and `input_ids_masked` are present):
        The logits for MMM image multimodal loss. Uses `book_masked_pos` to get masked patches. The flattened
            output is returned when `bool_masked_pos` has some of the patches masked.
    mmm_text_logits (`torch.FloatTensor` of shape `(batch_size, text_seq_length, text_vocab_size)` or of shape `(`(total_masked_seq_length, text_vocab_size)`), *optional*, returned when `pixel_values` and `input_ids_masked` are present):
        The logits for MMM text multimodal loss. The flattened output is returned when `input_ids_masked` has
            some of the tokens masked.
    Nloss	loss_infor   r   r   r   r    r!   image_masked_embeddingsimage_masked_outputtext_masked_embeddingstext_masked_outputmultimodal_masked_embeddingsmultimodal_masked_output
mim_logits
mlm_logits
itm_logitscontrastive_logits_per_imagecontrastive_logits_per_textmmm_image_logitsmmm_text_logitsr"   c                    s$   g dt  fdd  D S )N)r   r   r!   rI   rG   rK   c                 3   s.    | ]}|vr | nt  | V  qd S Nr#   r&   r*   transformer_outputsr+   r,   r-      s   , z5FlavaForPreTrainingOutput.to_tuple.<locals>.<genexpr>r.   r)   r+   rT   r,   r%      s   z"FlavaForPreTrainingOutput.to_tuple)!r1   r2   r3   r4   rD   r5   r6   r7   rE   r8   r   r   r   r   r   r    r!   rF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   rR   r/   r   r%   r+   r+   r+   r,   rC      s0   
 7rC   c                	       sx   e Zd ZdZddededdf fddZd	ejd
e	de	dejfddZ
		ddejdejdB dedejfddZ  ZS )FlavaImageEmbeddingszb
    Construct the CLS token, position and patch embeddings. Optionally, also the mask token.
    Fconfiguse_mask_tokenr"   Nc                    s   t    |p	|j}ttdd|j| _|r#ttdd|jnd | _t	|j
|j|j|jd| _| jj}ttd|d |j| _t|j| _|j| _|| _d S )Nr   )
image_size
patch_sizenum_channels	embed_dim)super__init__
mask_tokenr   	Parameterr5   zeroshidden_size	cls_tokenPatchEmbeddingsrY   rZ   r[   patch_embeddingsnum_patchesposition_embeddingsDropouthidden_dropout_probdropoutrW   )r*   rW   rX   rf   	__class__r+   r,   r^      s   

 
zFlavaImageEmbeddings.__init__
embeddingsheightwidthc                 C   s   |j d d }| jj d d }tj s||kr||kr| jS | jddddf }| jddddf }|j d }|| j }	|| j }
t|d }|d|||}|dddd}t	j
j||	|
fdd	d
}|dddddd|}tj||fddS )a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   Ng      ?r   r      bicubicF)sizemodealign_cornersdim)shaperg   r5   jit
is_tracingrZ   r   reshapepermuter   
functionalinterpolateviewcat)r*   rm   rn   ro   rf   num_positionsclass_pos_embedpatch_pos_embedrw   
new_height	new_widthsqrt_num_positionsr+   r+   r,   interpolate_pos_encoding  s(   



z-FlavaImageEmbeddings.interpolate_pos_encodingpixel_valuesbool_masked_posr   c                 C   s   |j \}}}}| j||d}| \}}	}
|d urB| j||	d}| dkr0||dd}|d|}|d|  ||  }| j	|dd}t
j||fdd}|r_|| ||| }n|| j }| |}|S )N)r   rp   r   r   g      ?r   rv   )rx   re   rs   r_   expandrw   r   	unsqueezetype_asrc   r5   r   r   rg   rj   )r*   r   r   r   
batch_sizer[   rn   ro   rm   seq_len_mask_tokensmask
cls_tokensr+   r+   r,   forward)  s    

zFlavaImageEmbeddings.forwardFNF)r1   r2   r3   r4   r   rB   r^   r5   Tensorintr   
BoolTensorr   __classcell__r+   r+   rk   r,   rV      s    +rV   c                	       sd   e Zd ZdZ				ddedeeeef B ded	ef fd
dZddejde	dejfddZ
  ZS )rd   z#
    Image to Patch Embedding.
          r      rY   rZ   r[   r\   c                    s   t    t|tjjs||f}t|tjjs||f}|d |d  |d |d   }|| _|| _|| _t	j
||||d| _d S )Nr   r   )kernel_sizestride)r]   r^   
isinstancecollectionsabcIterablerY   rZ   rf   r   Conv2d
projection)r*   rY   rZ   r[   r\   rf   rk   r+   r,   r^   R  s   
 zPatchEmbeddings.__init__Fr   r   r"   c              
   C   sx   |j \}}}}|s.|| jd ks|| jd kr.td| d| d| jd  d| jd  d	| |ddd}|S )Nr   r   zInput image size (*z) doesn't match model (z).rq   )rx   rY   
ValueErrorr   flatten	transpose)r*   r   r   r   r[   rn   ro   xr+   r+   r,   r   e  s   zPatchEmbeddings.forward)r   r   r   r   r   )r1   r2   r3   r4   r   r/   r^   r5   r   rB   r   r   r+   r+   rk   r,   rd   M  s     $rd   c                       sP   e Zd ZdZ fddZ			d
dejdB dejdB dejdB fdd	Z  ZS )FlavaTextEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.c                    s   t    tj|j|j|jd| _t|j|j| _	t|j
|j| _tj|j|jd| _t|j| _| jdt|jddd | jdtj| j tjddd d S )	N)padding_idxepsposition_idsr   rp   F)
persistenttoken_type_ids)dtype)r]   r^   r   	Embedding
vocab_sizerb   pad_token_idword_embeddingsmax_position_embeddingsrg   type_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsrh   ri   rj   register_bufferr5   aranger   ra   r   rs   longr*   rW   rk   r+   r,   r^   t  s   

zFlavaTextEmbeddings.__init__N	input_idsr   r   c                 C   s   |  }|d }|d u r| jd d d |f }|d u rAt| dr6| jd d d |f }||d |}|}ntj|tj| jjd}| 	|}| 
|}	||	 }
| |}|
|7 }
| |
}
| |
}
|
S )Nr   r   r   )r   device)rs   r   hasattrr   r   r5   ra   r   r   r   r   rg   r   rj   )r*   r   r   r   input_shape
seq_lengthbuffered_token_type_ids buffered_token_type_ids_expandedinputs_embedsr   rm   rg   r+   r+   r,   r     s$   





zFlavaTextEmbeddings.forwardNNN)	r1   r2   r3   r4   r^   r5   r   r   r   r+   r+   rk   r,   r   q  s    r   c                       d   e Zd Zdeddf fddZ		ddejdejdB d	edeejejf eej B fd
dZ	  Z
S )FlavaSelfAttentionrW   r"   Nc                    s   t    |j|j dkrt|dstd|j d|j d|j| _t|j|j | _| j| j | _t	j
|j| j|jd| _t	j
|j| j|jd| _t	j
|j| j|jd| _t	|j| _d S )Nr   embedding_sizezThe hidden size z4 is not a multiple of the number of attention heads .bias)r]   r^   rb   num_attention_headsr   r   r   attention_head_sizeall_head_sizer   Linearqkv_biasquerykeyvaluerh   attention_probs_dropout_probrj   r   rk   r+   r,   r^     s   

zFlavaSelfAttention.__init__Fhidden_statesattention_maskoutput_attentionsc                 C   s  |j \}}}| ||d| j| jdd}| ||d| j| jdd}| ||d| j| jdd}	t	||dd}
|
t
| j }
|d urS|
| }
tjj|
dd}| |}t	||	}|dddd }| d d | jf }|j| }|r||f}|S |f}|S )Nrp   r   rq   rv   r   r   )rx   r   r   r   r   r   r   r   r5   matmulmathsqrtr   r}   softmaxrj   r|   
contiguousrs   r   )r*   r   r   r   r   r   r   query_layer	key_layervalue_layerattention_scoresattention_probscontext_layernew_context_layer_shapeoutputsr+   r+   r,   r     s6   

zFlavaSelfAttention.forwardr   r1   r2   r3   FlavaPossibleConfigsr^   r5   r   rB   r/   r   r   r+   r+   rk   r,   r     s    r   c                       sF   e Zd ZdZdeddf fddZdejdejdejfd	d
Z  Z	S )FlavaSelfOutputz
    The residual connection is defined in FlavaLayer (same as ViTLayer) instead of here (as is the case with other
    models), due to the layernorm applied before each block.
    rW   r"   Nc                    s.   t    t|j|j| _t|j| _d S rS   )	r]   r^   r   r   rb   denserh   ri   rj   r   rk   r+   r,   r^        
zFlavaSelfOutput.__init__r   input_tensorc                 C      |  |}| |}|S rS   r   rj   r*   r   r   r+   r+   r,   r        

zFlavaSelfOutput.forward)
r1   r2   r3   r4   r   r^   r5   r   r   r   r+   r+   rk   r,   r     s    $r   c                       r   )FlavaAttentionrW   r"   Nc                    s"   t    t|| _t|| _d S rS   )r]   r^   r   	attentionr   outputr   rk   r+   r,   r^     s   

zFlavaAttention.__init__Fr   r   r   c                 C   s6   | j |||d}| |d |}|f|dd   }|S N)r   r   r   r   )r   r   )r*   r   r   r   self_outputsattention_outputr   r+   r+   r,   r     s   zFlavaAttention.forwardr   r   r+   r+   rk   r,   r     s    r   c                       s<   e Zd Zdeddf fddZdejdejfddZ  ZS )	FlavaIntermediaterW   r"   Nc                    sD   t    t|j|j| _t|jt	rt
|j | _d S |j| _d S rS   )r]   r^   r   r   rb   intermediate_sizer   r   
hidden_actstrr   intermediate_act_fnr   rk   r+   r,   r^     s
   
zFlavaIntermediate.__init__r   c                 C   r   rS   )r   r   r*   r   r+   r+   r,   r     r   zFlavaIntermediate.forward	r1   r2   r3   r   r^   r5   r   r   r   r+   r+   rk   r,   r     s    	r   c                       sB   e Zd Zdeddf fddZdejdejdejfdd	Z  ZS )
FlavaOutputrW   r"   Nc                    s.   t    t|j|j| _t|j| _	d S rS   )
r]   r^   r   r   r   rb   r   rh   ri   rj   r   rk   r+   r,   r^   %  r   zFlavaOutput.__init__r   r   c                 C   s    |  |}| |}|| }|S rS   r   r   r+   r+   r,   r   +  s   

zFlavaOutput.forwardr   r+   r+   rk   r,   r   $  s    $r   c                       sh   e Zd ZdZdeddf fddZ		ddejd	ejdB d
ede	ejejf e	ej B fddZ
  ZS )
FlavaLayerz?This corresponds to the Block class in the timm implementation.rW   r"   Nc                    sb   t    |j| _d| _t|| _t|| _t|| _	t
j|j|jd| _t
j|j|jd| _d S )Nr   r   )r]   r^   chunk_size_feed_forwardseq_len_dimr   r   r   intermediater   r   r   r   rb   r   layernorm_beforelayernorm_afterr   rk   r+   r,   r^   7  s   



zFlavaLayer.__init__Fr   r   r   c                 C   s`   | j | |||d}|d }|dd  }|| }| |}| |}| ||}|f| }|S r   )r   r  r  r  r   )r*   r   r   r   self_attention_outputsr   r   layer_outputr+   r+   r,   r   C  s   


zFlavaLayer.forwardr   )r1   r2   r3   r4   r   r^   r5   r   rB   r/   r   r   r+   r+   rk   r,   r  4  s    r  c                       s^   e Zd Zdeddf fddZ				ddejd	ejdB d
edededee	B fddZ
  ZS )FlavaEncoderrW   r"   Nc                    s:   t     | _t fddt jD | _d| _d S )Nc                    s   g | ]}t  qS r+   )r  r'   r   rW   r+   r,   
<listcomp>d  s    z)FlavaEncoder.__init__.<locals>.<listcomp>F)	r]   r^   rW   r   
ModuleListrangenum_hidden_layerslayergradient_checkpointingr   rk   r  r,   r^   a  s   
 
zFlavaEncoder.__init__FTr   r   r   output_hidden_statesreturn_dictc                 C   s   |rdnd }|r
dnd }t | jD ]\}}	|r||f }|	|||}
|
d }|r/||
d f }q|r7||f }|sEtdd |||fD S t|||dS )Nr+   r   r   c                 s   s    | ]	}|d ur|V  qd S rS   r+   )r'   rA   r+   r+   r,   r-         z'FlavaEncoder.forward.<locals>.<genexpr>)last_hidden_stater   
attentions)	enumerater  r/   r
   )r*   r   r   r   r  r  all_hidden_statesall_self_attentionsilayer_modulelayer_outputsr+   r+   r,   r   g  s"   

zFlavaEncoder.forward)NFFT)r1   r2   r3   r   r^   r5   r   rB   r/   r
   r   r   r+   r+   rk   r,   r	  `  s&    	r	  c                       s2   e Zd Zdef fddZdejfddZ  ZS )FlavaPoolerrW   c                    s*   t    t|j|j| _t | _d S rS   )r]   r^   r   r   rb   r   Tanh
activationr   rk   r+   r,   r^     s   
zFlavaPooler.__init__r   c                 C   s(   |d d df }|  |}| |}|S Nr   )r   r  )r*   r   first_token_tensorpooled_outputr+   r+   r,   r     s   

zFlavaPooler.forwardr   r+   r+   rk   r,   r    s    r  c                       sR   e Zd ZU eed< dZdZdZe	 de
je
jB e
jB ddf fdd	Z  ZS )
FlavaPreTrainedModelrW   flava)imagetextTmoduler"   Nc                    s   t  | t|trt|j dS t|tr3t|j t|j	 |j
dur1t|j
 dS dS t|trQt|jt|jjd d t|j dS t|trc|jrat|j dS dS t|trst|j| jj dS dS )zInitialize the weightsNrp   r   )r]   _init_weightsr   FlavaMaskedPredictionHeadinitzeros_r   rV   rc   rg   r_   r   copy_r   r5   r   rx   r   r   FlavaMultimodalModeluse_cls_token
FlavaModel	constant_logit_scalerW   logit_scale_init_value)r*   r'  rk   r+   r,   r(    s&   



"

z"FlavaPreTrainedModel._init_weights)r1   r2   r3   r   r7   base_model_prefixinput_modalitiessupports_gradient_checkpointingr5   no_gradr   r   r   r   r(  r   r+   r+   rk   r,   r#    s   
 .r#  c                       s   e Zd ZU eed< dZdZdZddedef fddZ	d	e
jfd
dZde
jfddZe							ddejdB dejdB dedB dejdB dedB dedB dedB d	eeB fddZ  ZS )FlavaImageModelrW   zflava.image_modelr   r%  Tadd_pooling_layerc                    X   t  | || _t|| _t|| _tj|j	|j
d| _|r#t|nd| _|   dS v
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        r   N)r]   r^   rW   rV   rm   r	  encoderr   r   rb   r   	layernormr  pooler	post_initr*   rW   r9  rk   r+   r,   r^     s   

zFlavaImageModel.__init__r"   c                 C      | j jS rS   rm   re   r)   r+   r+   r,   get_input_embeddings     z$FlavaImageModel.get_input_embeddingsr   c                 C      || j _d S rS   rC  r*   r   r+   r+   r,   set_input_embeddings     z$FlavaImageModel.set_input_embeddingsNr   r   r   r   r  r  c                 K   s   |dur|n| j j}|dur|n| j j}|dur|n| j j}|du r&td| j|||d}	| j|	||||d}
|
d }| |}| jdurK| |nd}|sY||f|
dd  S t	|||
j
|
jdS )z
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, image_num_patches)`):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        Nz You have to specify pixel_values)r   r   r   r   r  r  r   r   r  pooler_outputr   r  )rW   r   r  use_return_dictr   rm   r=  r>  r?  r   r   r  )r*   r   r   r   r   r   r  r  kwargsembedding_outputencoder_outputssequence_outputr"  r+   r+   r,   r     s6   
zFlavaImageModel.forwardTNNNNNNN)r1   r2   r3   r   r7   r3  main_input_namer4  rB   r^   r   ModulerD  rH  r   r5   r   r   r/   r   r   r   r+   r+   rk   r,   r7    sB   
 
r7  c                       s   e Zd ZU eed< dZdZddedef fddZde	fd	d
Z
dejfddZe							ddejdB dejdB dejdB dejdB dedB dedB dedB deeB fddZ  ZS )FlavaTextModelrW   zflava.text_model)r&  Tr9  c                    r:  r;  )r]   r^   rW   r   rm   r	  r=  r   r   rb   r   r>  r  r?  r@  rA  rk   r+   r,   r^     s   

zFlavaTextModel.__init__r"   c                 C   rB  rS   rm   r   r)   r+   r+   r,   rD    rE  z#FlavaTextModel.get_input_embeddingsr   c                 C   rF  rS   rW  rG  r+   r+   r,   rH    rI  z#FlavaTextModel.set_input_embeddingsNr   r   r   r   r   r  r  c                 K   s   |dur|n| j j}|dur|n| j j}|dur|n| j j}|du r&td| }	|du r6tj|	|jd}| 	||	}
| j
|||d}| j||
|||d}|d }| |}| jdura| |nd}|so||f|dd  S t|||j|jdS )	a  
        input_ids (`torch.LongTensor` of shape `(batch_size, text_seq_length)`):
            Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input
            IDs?](../glossary#input-ids)
        token_type_ids (`torch.LongTensor` of shape `(batch_size, text_seq_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:
            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.
            [What are token type IDs?](../glossary#token-type-ids)
        NzYou have to specify input_idsr   )r   r   r   rJ  r   r   rK  )rW   r   r  rM  r   rs   r5   onesr   get_extended_attention_maskrm   r=  r>  r?  r   r   r  )r*   r   r   r   r   r   r  r  rN  r   extended_attention_maskrO  rP  rQ  r"  r+   r+   r,   r   !  sH   
zFlavaTextModel.forwardrR  rS  )r1   r2   r3   r   r7   r3  r4  rB   r^   rd   rD  r   rU  rH  r   r5   r   r/   r   r   r   r+   r+   rk   r,   rV    s@   
 
rV  c                       s~   e Zd ZU eed< dZdZddef fddZe				dde	j
de	j
dB d	edB d
edB dedB deeB fddZ  ZS )r-  rW   zflava.multimodal_modelr   Tc                    sv   t  | || _| jj| _| jrttdd|j| _	t
|| _tj|j|jd| _|r2t|nd| _|   dS )r<  r   r   N)r]   r^   rW   r.  r   r`   r5   ra   rb   rc   r	  r=  r   r   r>  r  r?  r@  rA  rk   r+   r,   r^   o  s   

zFlavaMultimodalModel.__init__Nr   r   r  r  r"   c                 K   s  |dur|n| j j}|dur|n| j j}|dur|n| j j}| \}}}	| jr=| j|dd}
tj	|
|fdd}|d7 }|du rKtj
||f|jd}| |||f}| j|||||d}|d }| |}| jdurp| |nd}|s~||f|dd  S t|||j|jdS )	z
        hidden_states (`torch.FloatTensor` of shape `(batch_size, image_num_patches + text_seq_len, hidden_size)`):
            The concatenated hidden states of unimodal encoders.
        Nrp   r   rv   rX  rJ  r   rK  )rW   r   r  rM  rs   r.  rc   r   r5   r   rY  r   rZ  r=  r>  r?  r   r   r  )r*   r   r   r   r  r  rN  r   r   r   r   r[  rP  rQ  r"  r+   r+   r,   r     sB   
zFlavaMultimodalModel.forwardrR  )NNNN)r1   r2   r3   r   r7   r3  rT  r^   r   r5   r   rB   r/   r   r   r   r+   r+   rk   r,   r-  h  s.   
 r-  c                       sV  e Zd ZU eed< def fddZee			ddej	dej	dB dej	dB dej	dB d	e
e d
eeB fddZee			ddej	dejdB dedB dej	dB d	e
e d
eeB fddZe											ddejdB dejdB dej	dB dej	dB dej	dB dejdB dej	dB dedB dedB dededB d
eeB fddZ  ZS )r/  rW   c                    s0  t  | t|jtstdt|j dt|jts(tdt|j dt|j	t
s;tddt|j	 d |j}|j}|j	}|j| _|j| _|j| _|j| _t|| _t|| _t|| _t| j| j| _t| j| j| _tt| jj| _t| j| j| _ t| j| j| _!| "  d S )NzLconfig.text_config is expected to be of type FlavaTextConfig but is of type r   zNconfig.image_config is expected to be of type FlavaImageConfig but is of type zMconfig.multimodal_config is expected to be of type FlavaMultimodalConfig but zis of type )#r]   r^   r   text_configr   	TypeErrortypeimage_configr   multimodal_configr   projection_dimrb   text_hidden_sizeimage_hidden_sizemm_hidden_sizerV  
text_modelr7  image_modelr-  multimodal_modelr   r   image_projectiontext_projectionr`   r5   tensorrW   r2  r1  image_to_mm_projectiontext_to_mm_projectionr@  )r*   rW   r\  r_  r`  rk   r+   r,   r^     sF   


zFlavaModel.__init__Nr   r   r   r   rN  r"   c                 K   s2   | j d||||dd|}|j}| ||_|S )a	  
        input_ids (`torch.LongTensor` of shape `(batch_size, text_seq_length)`):
            Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input
            IDs?](../glossary#input-ids)
        token_type_ids (`torch.LongTensor` of shape `(batch_size, text_seq_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:
            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.
            [What are token type IDs?](../glossary#token-type-ids)

        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, FlavaModel

        >>> model = FlavaModel.from_pretrained("{0}")
        >>> processor = AutoProcessor.from_pretrained("{0}")

        >>> inputs = processor(
        ...     text=["a photo of a cat", "a photo of a dog"], max_length=77, padding="max_length", return_tensors="pt"
        ... )
        >>> with torch.inference_mode():
        ...     text_features = model.get_text_features(**inputs)
        ```
        T)r   r   r   r   r  Nr+   )re  r  ri  rL  )r*   r   r   r   r   rN  text_outputsr  r+   r+   r,   get_text_features  s   &zFlavaModel.get_text_featuresr   r   r   c                 K   s2   | j d||||dd|}|j}| ||_|S )a   
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, image_num_patches)`):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).

        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, FlavaModel
        >>> from transformers.image_utils import load_image

        >>> model = FlavaModel.from_pretrained("{0}")
        >>> processor = AutoProcessor.from_pretrained("{0}")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = load_image(url)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> with torch.inference_mode():
        ...     image_features = model.get_image_features(**inputs)
        ```
        T)r   r   r   r   r  Nr+   )rf  r  rh  rL  )r*   r   r   r   r   rN  image_outputsr  r+   r+   r,   get_image_features  s   !zFlavaModel.get_image_featuresTimage_attention_maskskip_multimodal_encoderr   r  r  c              	   K   sz  |dur|n| j j}|
stdd}d}d}d}|dur7| j||||	|
|d}|d |d }}| |d }d}d}d}d}|dur_| j|||||	|
|d}|d |d }}| |d }d}d}|dur|dur|s|dur|j\}}}| jj	r|d7 }t
j|||jd	}t
j||gdd
}nd}t
j||gdd
}| j|||d}|d }|s||||||fS t||||||dS )a/
  
        input_ids (`torch.LongTensor` of shape `(batch_size, image_num_patches + text_seq_len)`):
            Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input
            IDs?](../glossary#input-ids)
        token_type_ids (`torch.LongTensor` of shape `(batch_size, image_num_patches + text_seq_len)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:
            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.
            [What are token type IDs?](../glossary#token-type-ids)
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, image_num_patches)`):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        image_attention_mask (`torch.Tensor` of shape `(batch_size, image_num_patches)`, *optional*):
            Mask to avoid performing attention on padding pixel values for image inputs. Mask values selected in `[0, 1]`:
            - 1 for pixel values that are real (i.e., **not masked**),
            - 0 for pixel values that are padding (i.e., **masked**).
        skip_multimodal_encoder (*bool*, *optional*):
            Skip any calculations for multimodal encoder. Useful if multimodal encoding is not going to be used.

        Examples:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, FlavaModel

        >>> model = FlavaModel.from_pretrained("facebook/flava-full")
        >>> processor = AutoProcessor.from_pretrained("facebook/flava-full")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> inputs = processor(text=["a photo of a cat"], images=image, return_tensors="pt", padding=True)

        >>> outputs = model(**inputs)

        >>> image_embeddings = outputs.image_embeddings
        >>> text_embeddings = outputs.text_embeddings
        >>> multimodal_embeddings = outputs.multimodal_embeddings

        >>> outputs.image_embeddings.shape
        torch.Size([1, 197, 768])

        >>> text_embeddings.shape
        torch.Size([1, 7, 768])

        >>> multimodal_embeddings.shape
        torch.Size([1, 205, 768])
        ```
        NzRFLAVA model requires hidden states to work. Please set `output_hidden_states=True`)r   r   r   r   r  r  r   rq   rp   )r   r   r   r   r   r  r  r   rX  rv   )r   r  )r   r   r   r   r    r!   )rW   r  r   rf  rk  re  rl  rx   rg  r.  r5   rY  r   r   r   )r*   r   r   r   r   r   r   rq  rr  r   r  r  rN  r   image_statesimage_mm_projectionr   r   text_statestext_mm_projectionr   r    r!   r   r   r   attention_mask_imageattention_multimodalmultimodal_inputr+   r+   r,   r   J  s   F
	zFlavaModel.forwardr   )NNNNNNNNNTN)r1   r2   r3   r   r7   r^   r   r   r5   r   r   r   r/   r   rn  r   rB   rp  
LongTensorr6   r   r   r   r+   r+   rk   r,   r/    s   
 +1,	
r/  c                       s<   e Zd Zdedef fddZdejdejfddZ  ZS )	FlavaImageCodebookResPathin_sizeout_sizec                    s   t    |d }t }t |d< tj||ddd|d< t |d< tj||ddd|d< t |d	< tj||ddd|d
< t |d< tj||ddd|d< t|| _d S )N   relu_1r   r   r   paddingconv_1relu_2conv_2relu_3conv_3relu_4r   conv_4)r]   r^   r   r   ReLUr   
Sequentialpath)r*   r|  r}  rN  hid_sizer  rk   r+   r,   r^     s   
z"FlavaImageCodebookResPath.__init__r   r"   c                 C   
   |  |S rS   )r  r*   r   r+   r+   r,   r        
z!FlavaImageCodebookResPath.forward	r1   r2   r3   r   r^   r5   r   r   r   r+   r+   rk   r,   r{    s    r{  c                       s@   e Zd Zdededef fddZdejdejfdd	Z  ZS )
FlavaImageCodebookBlockr|  r}  
num_layersc                    sP   t    d|d  | _||krtj||ddd| _nt | _t||| _d S )Nr   rq   r   r  )	r]   r^   	post_gainr   r   id_pathIdentityr{  res_path)r*   r|  r}  r  rN  rk   r+   r,   r^     s   

z FlavaImageCodebookBlock.__init__r   r"   c                 C   s   |  || j| |  S rS   )r  r  r  r  r+   r+   r,   r     s   zFlavaImageCodebookBlock.forwardr  r+   r+   rk   r,   r    s    r  c                       sJ   e Zd Zddededededef
 fddZd	ejd
ejfddZ  Z	S )FlavaImageCodebookLayerGroupT
num_blocksr  r|  r}  use_poolc                    s   t    t }t|D ]!}|dkr t||||d|d  < qt||||d|d  < q|r8tjdd|d< t|| _d S )Nr   block_r   rq   )r   pool)	r]   r^   r   r  r  r   	MaxPool2dr  group)r*   r  r  r|  r}  r  blocksr  rk   r+   r,   r^     s   
z%FlavaImageCodebookLayerGroup.__init__r   r"   c                 C   r  rS   )r  r  r+   r+   r,   r     r  z$FlavaImageCodebookLayerGroup.forwardrR  )
r1   r2   r3   r   rB   r^   r5   r   r   r   r+   r+   rk   r,   r    s    $r  a"  
    The FLAVA's image codebook model inspired from DALL-E's original encoder. Outputs raw hidden states and can be used
    to generate image tokens for an image based on DALL-E's vocab. Used to generate labels for MIM. Use
    `get_codebook_indices` to get image tokens for an image.
    c                       s   e Zd ZU dZeed< dZdZdZdede	f fddZ
dejd	ejfd
dZdejd	ejfddZdejd	ejfddZ  ZS )FlavaImageCodebookmodelrW   r   r8  FrN  c                    sd  t  | || _|j| _|j| _|j| _|j| _|j| _| j| j }t }t	
 |d< t	jd| j | jddd|d< t }t	j| jd| j ddd|d	< t| j|d| j d| j |d
< t| j|d| j d| j |d< t| j|d| j d| j |d< t| j|d| j d| j dd|d< t	||d< t	|| _|   | jjr|  D ]}d|_qd S d S )Nrelu   r   r   r  conv   r   inputgroup_1rq   group_2r~  group_3F)r  group_4r   )r]   r^   rW   
num_groupsinput_channelsnum_blocks_per_grouprb   r   r   r   r  r   r  r  r  r@  freeze
parametersrequires_grad)r*   rW   rN  r  output_blocksr  paramrk   r+   r,   r^   $  sB   
zFlavaImageCodebook.__init__r"   c                 C   s*   dt  dt  d | |}tj|ddS )NaI  
        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
                Pixel values. Codebook pixel values can be obtained using [`AutoImageProcessor`] by passing
                `return_codebook_pixels=True`. See [`FlavaImageProcessor.__call__`] for details.

        Examples:
        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoImageProcessor, FlavaImageCodebook

        >>> model = FlavaImageCodebook.from_pretrained("E")
        >>> image_processor = AutoImageProcessor.from_pretrained("a  ")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> inputs = image_processor([image], return_codebook_pixels=True, return_tensors="pt")
        >>> inputs = dict(pixel_values=inputs.codebook_pixel_values)

        >>> outputs = model.get_codebook_indices(**inputs)
        ```
        r   )axis)_CHECKPOINT_FOR_CODEBOOK_DOCr  r5   argmaxr*   r   z_logitsr+   r+   r,   get_codebook_indicesP  s   
z'FlavaImageCodebook.get_codebook_indicesc                 C   s   |  |}tjdd|S )Nr   rv   )r  r   Softmaxr  r+   r+   r,   get_codebook_probsn  s   
z%FlavaImageCodebook.get_codebook_probsc                 K   sh   dt  dt  d t|jdkrtd|j d|jd | jkr/td|jd  d	| j | |S )
NaJ  
        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
                Pixel values. Codebook pixel values can be obtained using [`AutoImageProcessor`] by passing
                `return_codebook_pixels=True`. See [`FlavaImageProcessor.__call__`] for details.

        Examples:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoImageProcessor, FlavaImageCodebook

        >>> model = FlavaImageCodebook.from_pretrained("r  a  ")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> inputs = image_processor([image], return_codebook_pixels=True, return_tensors="pt")
        >>> inputs = dict(pixel_values=inputs.codebook_pixel_values)

        >>> outputs = model(**inputs)
        >>> print(outputs.shape)
        (1, 196)
        ```
        r~  zinput shape z
 is not 4dr   z
input has z channels but model built for )r  lenrx   r   r  r  )r*   r   rN  r+   r+   r,   r   r  s   
zFlavaImageCodebook.forward)r1   r2   r3   r3  r   r7   rT  r4  r5  r   r^   r5   r   r  r  r6   r   r   r+   r+   rk   r,   r    s   
 ,r  c                       $   e Zd Z fddZdd Z  ZS )FlavaPredictionHeadTransformc                    sV   t    t|j|j| _t|jtrt	|j | _
n|j| _
tj|j|jd| _d S )Nr   )r]   r^   r   r   rb   r   r   r   r   r   transform_act_fnr   r   r   rk   r+   r,   r^     s   
z%FlavaPredictionHeadTransform.__init__c                 C   s"   |  |}| |}| |}|S rS   )r   r  r   r   r+   r+   r,   r     s   


z$FlavaPredictionHeadTransform.forwardr1   r2   r3   r^   r   r   r+   r+   rk   r,   r    s    	r  c                       s&   e Zd Zd fdd	Zdd Z  ZS )r)  Nc                    s\   t    || _t|| _tj|j|jdd| _	t
t|j| _|d ur,|| j	_d S d S )NTr   )r]   r^   rW   r  	transformr   r   rb   r   decoderr`   r5   ra   r   weight)r*   rW   r  rk   r+   r,   r^     s   

z"FlavaMaskedPredictionHead.__init__c                 C   r   rS   )r  r  r  r+   r+   r,   r        

z!FlavaMaskedPredictionHead.forwardrS   r  r+   r+   rk   r,   r)    s    	r)  c                       r  )FlavaITMHeadc                    s.   t    || _t|| _t|jd| _d S )Nrq   )	r]   r^   rW   r  r?  r   r   rb   seq_relationshipr   rk   r+   r,   r^     s   

zFlavaITMHead.__init__c                 C   r   rS   )r?  r  r  r+   r+   r,   r     r  zFlavaITMHead.forwardr  r+   r+   rk   r,   r    s    r  c                       r  )FlavaGlobalContrastiveHeadc                    s   t    || _|j| _d S rS   )r]   r^   rW   global_backprop_contrastiver   rk   r+   r,   r^     s   
z#FlavaGlobalContrastiveHead.__init__c                    s2  t |}t j rt j s!t j d jd} g}g}nQ d}t j }	| j	r?t jj
j }t jj
j}n$fddt|	D } fddt|	D }t j|  t j| |t j  t j| jd }t |}t |}t  |dd| }
t |dd| }|
||fS )Nr   rX  c                       g | ]}t  qS r+   r5   
zeros_liker
  )r   r+   r,   r        z6FlavaGlobalContrastiveHead.forward.<locals>.<listcomp>c                    r  r+   r  r
  )r   r+   r,   r    r  r   )r5   expdistributedis_availableis_initializedr   rs   r   get_world_sizer  r   r}   
all_gatherr  get_rankr   r   r   )r*   r   r   r1  temperaturelabelsimage_embeddings_alltext_embeddings_alllocal_batch_size
world_sizelogits_per_imagelogits_per_textr+   )r   r   r,   r     s,   





z"FlavaGlobalContrastiveHead.forwardr  r+   r+   rk   r,   r    s    r  zk
    The FLAVA model for pretraining which outputs losses, embeddings, logits and transformer outputs.
    c                '       s(  e Zd ZdddddZd#dedejdB f fd	d
Zdej	fddZ
e																	d$dejdB dejdB dejdB dejdB dej	dB dej	dB dej	dB dejdB dej	dB dedB dej	dB dej	dB dej	dB dedB dededB dedB d eej	 eB f$d!d"Z  ZS )%FlavaForPreTrainingzmmm_text_head.decoder.biaszmim_head.decoder.biaszmlm_head.decoder.biaszmmm_image_head.decoder.bias)zmmm_text_head.biaszmim_head.biaszmlm_head.biaszmmm_image_head.biasNrW   image_codebookc                    s   t  | t|| _|| _| jdu r|jrt|j| _t|j	| _
t|j| _t|| _t|j	| _t|j| _t|| _|j	j| _|jj| _|j| _|j| _|j| _|j| _|j| _|j| _|j| _|j| _|   dS )z
        image_codebook ([`nn.Module`]):
            If passed, the image codebook will be set to this. Otherwise, it will be initialized using the
            image_codebook_config defined in the config first as the first parameter.
        N)r]   r^   r/  r$  r  init_codebookr  image_codebook_configr)  r_  mim_headr\  mlm_headr  itm_headmmm_image_headmmm_text_headr  global_contrastive_headr   image_vocab_sizetext_vocab_size
mlm_weight
mim_weightglobal_contrastive_weightce_ignore_index
itm_weightmmm_image_weightmmm_text_weight skip_unmasked_multimodal_encoderr@  )r*   rW   r  rk   r+   r,   r^     s,   




zFlavaForPreTraining.__init__r   c                 C   s"   |  dkr||dd}|S )Nrq   r   rp   )rw   r   rs   r  r+   r+   r,   _resize_to_2d  s   z!FlavaForPreTraining._resize_to_2dTr   input_ids_maskedr   codebook_pixel_valuesr   r   r   r   rq  r  
mlm_labels
mim_labels
itm_labelsr   r  r  return_lossr"   c           7      K   s  |dur|n| j j}|dur|n| j j}|
dur|
n| j}
|du r,|dur,td |}| j||||||	|
||dd
}| j|||||	|||dd	}d}|j}|j}|j}|j}|j	}d } } } } } } }!d }" }# }$}%d }& }'}(|dus~|dur|du r|r| j
du rtd|du rtd| j
|}| jdkr	|dur	|du r	|})|dur| |}| |}| j||d< |)dd|d	 dddf })|| j}*||* }+|)|*ddf })| |)}"|rtj|"d
| j|+d
}|| j9 }n| |)}"| jdkrj|durj|du rj|},|dure| |}|,dd|d	 dddf },|| j}*||* }-|,|*ddf },| |,}#|rdtj|#d
| j|-d
}|| j9 }n| |,}#| jdkr|dur| |}&|dur|d}.t|.  |.|.!dg}|rtj|&|}!|!| j9 }!|dur|| }|dur|| }|dur|| }|| }|dur-| j"dkr-|})|d	d	 }/|)dddd|/ ddf })|dur(| |}| |}| j||d< || j}*||* }+|)|*ddf })| #|)}%|r'tj|%d
| j|+d
}|| j"9 }n| #|)}%|dur| j$dkr|},|,dd|d	 dddf },|dur| |}|| j}*||* }-|,|*ddf },| %|,}$|rtj|$d
| j|-d
}|| j$9 }n| %|,}$|dur|dur| j&dkr| j'|dddddf }0tjj(|0d
d}0| j)|dddddf }1tjj(|1d
d}1| j*r| jj+j,-t.t/ | 0|1|0| jj+\}'}(}2|dur|'| }'|(| }(|2| }2|rtj|'|2}3tj|(|2}4|3|4 d } | | j&9 } t1|||!| ||d}5|r*|52 s*t3dd |54 D }|s||j5dur9|j56 nd||j7durF|j76 nd|j	|j8durT|j86 nd||j5dura|j56 nd||j7durn|j76 nd||j8dur{|j86 nd|"|#|&|'|'|%|$f}6|r|52 s||5f|6 }6t9dd |6D S t:d&i d|d|5d|d|j5d|d|j7d|j	d|j8d|d|j5d|d|j7d|d|j8d|"d |#d!|&d"|'d#|(d$|%d%|$S )'a  
        input_ids (`torch.LongTensor` of shape `(batch_size, text_seq_len)`):
            Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input
            IDs?](../glossary#input-ids)
        input_ids_masked (`torch.LongTensor` of shape `(batch_size, text_seq_len)`):
            Indices of input sequence tokens in the vocabulary. These ones are the masked version of the original task
            to be used with MLM. Indices can be obtained using [`AutoTokenizer`] along with
            [`DataCollatorForMaskedLanguageModeling`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details. [What are input IDs?](../glossary#input-ids)
        codebook_pixel_values (`torch.FloatTensor` of shape `(batch_size, num_image_patches, patch_size, patch_size, 3)`, *optional*):
            Pixel values for image patches that are used to compute the image codebook labels for masked image modeling.
        token_type_ids (`torch.LongTensor` of shape `(batch_size, text_seq_len)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:
            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.
            [What are token type IDs?](../glossary#token-type-ids)
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, image_num_patches)`):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        image_attention_mask (`torch.FloatTensor` of shape `(batch_size, image_num_patches)`, *optional*):
            Mask to avoid performing attention on padding token indices specifically for images. Mask values selected
            in `[0, 1]`:
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
            [What are attention masks?](../glossary#attention-mask)
        skip_unmasked_multimodal_encoder (*bool*, *optional*):
            Skip any calculations for multimodal encoder for unmasked inputs. FLAVA pretraining doesn't need unmasked
            multimodal embeddings or outputs as of now.
        mlm_labels (`torch.LongTensor` of shape `(batch_size, text_seq_len)`, *optional*):
            Labels for computing the left-to-right language and multimodal masked modeling loss (next word prediction).
            Indices should be in `[-100, 0, ..., text_config.vocab_size - 1]` (see `input_ids` docstring). Tokens with
            indices set to `-100` are ignored (masked), the loss is only computed for the tokens with labels in `[0,
            ..., text_config.vocab_size - 1]`.
        mim_labels (`torch.LongTensor` of shape `(batch_size, image_num_patches)`, *optional*):
            Labels for computing the image and multimodal masked modeling loss. Indices should be in `[-100, 0, ...,
            image_config.vocab_size - 1]`. Tokens with indices set to `-100` are ignored (masked), the loss is only
            computed for the tokens with labels in `[0, ..., image_config.vocab_size - 1]`. If not passed, they are
            generated automatically using the image codebook assigned to the model. By default, it uses
            [`FlavaImageCodebook`]. See [`FlavaImageCodebook`] to understand how to generate mim_labels.
        itm_labels (`torch.LongTensor` of shape `(batch_size, 1)`, *optional*):
            Labels for computing the image-text matching loss. 0 means the pairs don't match and 1 means they match.
            The pairs with 0 will be skipped for calculation of MMM and global contrastive losses as well.
        return_loss (`bool`, *optional*, default to None):
            Whether to return calculated loss or not.

        Examples:
        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import FlavaForPreTraining, AutoProcessor

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> model = FlavaForPreTraining.from_pretrained("facebook/flava-full")
        >>> processor = AutoProcessor.from_pretrained("facebook/flava-full")

        >>> text = ["a photo of a cat"]

        >>> inputs = processor(
        ...     images=[image],
        ...     text=text,
        ...     return_masks=True,
        ...     return_codebook_pixels=True,
        ...     padding=True,
        ...     max_length=77,
        ...     return_tensors="pt",
        ... )


        >>> output = model(**inputs)
        ```
        Nz`input_ids_masked` isn't passed which means MLM loss won't be calculated correctlySetting it to `input_ids` so that model can work. Please pass it if this is unintentional. This is usually OKAY if you are doing inference on unmasked text...T)
r   r   r   r   r   rq  rr  r   r  r  )	r   r   r   r   rq  r   r   r  r  z`return_loss` is set to True but the image codebook is not initialized and no `mim_labels`  have been passed. Reinstantiate the model with `init_codebook` set to True or pass in your custom `mim_labels`z`codebook_pixel_value` are required to generate `mim_labels` if loss is expected. Call `AutoProcessor` with `return_codebook_pixels` set to Truer   r   rp   rq   rv   )r9   r:   r;   r<   r=   r>   c                 s   s     | ]}|d ur
|ndV  qd S r   r+   )r'   rD   r+   r+   r,   r-   Z  s    z.FlavaForPreTraining.forward.<locals>.<genexpr>c                 s   s    | ]	}|d u r|V  qd S rS   r+   )r'   r   r+   r+   r,   r-   {  r  rD   rE   r   r   r   r   r    r!   rF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   rR   r+   );rW   rM  r  r  loggerwarningr$  r   r   r    r  RuntimeErrorr   r  r  r  r  ners   r  r   r}   cross_entropyr   r  r  r  r  r  r  r5   whereanynewr  r  r  r  r  ri  	normalizerh  trainingr1  dataclamp_LOGIT_SCALE_CLAMP_MINLOGIT_SCALE_CLAMP_MAXr  r8   r@   sumr?   r   r%   r   r!   r/   rC   )7r*   r   r  r   r  r   r   r   r   rq  r  r  r  r  r   r  r  r  rN  flava_outputflava_masked_outputpos_maskr   r   rF   rH   rJ   
total_lossmim_lossmlm_lossmmm_text_lossmmm_image_lossgc_lossitm_lossrL   rM   rR   rQ   rN   r  r  sequence_for_imagemasked_tokensmim_labels_filteredsequence_for_textmlm_labels_filtered	pos_pairs	end_indextext_embeddingimage_embedding	gc_labelsgc_loss_imagegc_loss_textflava_lossesr   r+   r+   r,   r   #  s  b
 


"


 

"















"




 



	
	
zFlavaForPreTraining.forwardrS   )NNNNNNNNNNNNNNTNN)r1   r2   r3   _tied_weights_keysr   r   rU  r^   r5   r   r  r   rz  r6   rB   r/   rC   r   r   r+   r+   rk   r,   r    s~    #	
r  )r  r  r7  r/  r-  r#  rV  )Jr4   r   r   r   dataclassesr   typingr   r5   r    r   r*  activationsr   modeling_layersr	   modeling_outputsr
   r   modeling_utilsr   processing_utilsr   utilsr   r   r   r   r   r   configuration_flavar   r   r   r   r   
get_loggerr1   r  r  r  r  r   r   r8   rC   rU  rV   rd   r   r   r   r   r   r   r  r	  r  r#  r7  rV  r-  r/  r{  r  r  r  r  r)  r  r  r  __all__r+   r+   r+   r,   <module>   s    
		"	\d$6D,'QcQ  #z(   '