o
    ߥiE                    @   s  d Z ddlZddlZddlmZ ddlmZmZmZm	Z	 ddl
Z
ddlmZ ddl
mZmZ ddlmZ ddlmZ dd	lmZmZmZmZ dd
lmZmZmZ ddlmZ ddlm Z  ddl!m"Z" ddl#m$Z$ ddl%m&Z& ddl'm(Z( ddl)m*Z*m+Z+m,Z,m-Z- e . Z/dZ0dZ1dZ2e3e
j4Z5e3dZ6dZ7dZ8e9dZ:g dZ;zddl<m=Z> dZ?G dd de>Z=W n e@y   dZ?Y nw 			dPd d!ZAe7fd"d#ZBd$d% ZCd&d' ZDd(e
jd)e9d*e9fd+d,ZE	dQd-e
jFd.e
jGd/e9fd0d1ZH	dRd2e
jd.e
jGd3ee9 fd4d5ZI		dSd6d7ZJdTd8d9ZKG d:d; d;ejLZMG d<d= d=ejNZOG d>d? d?ejNZPG d@dA dAejNZQG dBdC dCeZReG dDdE dEeZSdFZTdGZUdHZVG dIdJ dJeRZWG dKdL dLeRZXedMeTG dNdO dOeRZYdS )Uz PyTorch OFA model.    N)	dataclass)DictListOptionalTuple)version)Tensornn)
functional)ACT2FN)ModelOutputadd_code_sample_docstringsadd_start_docstrings%add_start_docstrings_to_model_forward))BaseModelOutputWithPastAndCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutput)PreTrainedModel)logging   )	OFAConfig)utils)ResNet)DropPath)vit_basevit_huge	vit_largevit_large_336ofa-baser   OFATokenizerz1.9.1   g    חA)zofa-tinyz
ofa-mediumr   z	ofa-largezofa-huge)FusedLayerNormTc                       s$   e Zd Zejj fddZ  ZS )r!   c                    sP   |j s	t |S tj|j t |W  d    S 1 s!w   Y  d S N)is_cudasuperforwardtorchcudadevice)selfx	__class__ b/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/multi_modal/ofa/modeling_ofa.pyr%   E   s
   
$zFusedLayerNorm.forward)__name__
__module____qualname__r&   jitunusedr%   __classcell__r-   r-   r+   r.   r!   C   s    r!   Fh㈵>c                 C   s<   t j rd}|st j rtrt| ||S t j| ||S )zV
    Layer normalization.
    If apex is available, use `FusedLayerNorm` instead.
    T)	r&   r2   is_scriptingr'   is_availablehas_fused_layernormr!   r	   	LayerNorm)normalized_shapeepselementwise_affineexportr-   r-   r.   r9   Q   s
   
r9   c           
      C   s   t j|t jddddf }t j|t jddddf }|| }t |}| d }t ||k || k@ |d t |}t t || t|d |  |d  | }|	 }t |
||||  }	|	|  d S )z6
    Make relative position indices for the text.
    dtypeN   r   )r&   arangelongsignwhereabsceillogmathintle)
bucket_sizemax_positioncontext_pos
memory_posrelative_posrC   midabs_poslog_pos
bucket_posr-   r-   r.   make_token_bucket_position`   s,   
rT   c                 C   sn  t | }t | }ttkrt t j||gdd}n
t t ||g}t |d}|dddddf |dddddf  }|ddd }|dddddf  | d 7  < |dddddf  | d 7  < |dddddf  d|  d 9  < t j	| |  d fd |j
d}|d|ddddf< |d	 |dddf< |d |dddf< |d |d
< |S )z7
    Make relative position indices for the image.
    ij)indexingr   Nr@   r   )sizer?      )r   r   )r&   rA   TORCH_VERSIONTORCH_MESH_GRID_WARNING_VERSIONstackmeshgridflattenpermute
contiguouszerosr?   sum)rK   num_relative_distancecoords_hcoords_wcoordscoords_flattenrelative_coordsrelative_position_indexr-   r-   r.   make_image_bucket_positionu   s8   

""&rj   c                 G   s2   t |dkr
|  }tj|d | jdj|  S )z
    Return a Tensor of `size` filled with a range function on the device of x.
    If size is empty, using the size of the variable x.
    r   rX   r(   )lenrW   r&   rA   r(   expandr`   )r*   rW   r-   r-   r.   
new_arange   s   rn   	input_idspad_token_iddecoder_start_token_idc                 C   sh   |  | j}| ddddf  |ddddf< ||dddf< |dus*J d||dk| |S )z1
    Shift input ids one token to the right.
    NrX   r   r   z1self.model.config.pad_token_id has to be defined.i)	new_zerosshapeclonemasked_fill_)ro   rp   rq   shifted_input_idsr-   r-   r.   shift_tokens_right   s   (rw   input_ids_shaper?   past_key_values_lengthc                 C   s   | \}}t ||ftd}t |d}|||d |ddk d ||}|dkr?t jt j	|||d|gdd}|ddddddf 
|d||| S )zC
    Make causal mask used for uni-directional self-attention.
    z-infrX   r   r   r>   dimN)r&   fullfloatrA   rW   ru   viewtocatra   rm   )rx   r?   ry   bsztgt_lenmask	mask_condr-   r-   r.   _make_causal_mask   s   "
r   r   r   c                 C   s^   |   \}}|dur|n|}| ddddddf |d|||}|| t|jS )z_
    Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
    Nr   )rW   rm   r   masked_fillboolr&   finfomin)r   r?   r   r   src_lenexpanded_maskr-   r-   r.   _expand_mask   s   

r   c                 C   s\   t j| ||d}t jj|jd|d d |dur"t j|j| d |r,t j|jd |S )z
    Embedding for tokens
    padding_idxr         ࿩meanstdN)r	   	Embeddinginitnormal_weight	constant_)num_embeddingsembedding_dimr   	zero_initmr-   r-   r.   r      s   r   c                 C   s4   t | ||}t j|j |rt j|jd |S )zH
    Implementation of linear projection with xavier initialization
            )r	   Linearr   xavier_uniform_r   r   bias)in_featuresout_featuresr   r   r-   r-   r.   r      s
   r   c                       s.   e Zd ZdZd fdd	Z fddZ  ZS )LayerDropModuleListz
    A LayerDrop implementation based on :class:`torch.nn.ModuleList`.

    Args:
        p (float): probability of dropping out each layer
        modules (iterable, optional): an iterable of modules to add
    Nc                    s   t  | || _d S r"   )r$   __init__p)r)   r   modulesr+   r-   r.   r      s   
zLayerDropModuleList.__init__c                 #   sJ    t t|  }tt  D ]\}}| jr|| | jkr"|V  qd S r"   )	r&   emptyrl   uniform_	enumerater$   __iter__trainingr   )r)   dropout_probsir   r+   r-   r.   r      s   zLayerDropModuleList.__iter__r"   )r/   r0   r1   __doc__r   r   r4   r-   r-   r+   r.   r      s    r   c                       s   e Zd ZdZ					ddededed	ed
ededef fddZdej	dedefddZ
					ddej	deej	 deeej	  deej	 dedeej	 fddZ  ZS )OFAAttentiona}  
    Multi-headed attention, with additional implementation for NormFormer.

    Args:
        embed_dim (`int`): embedding dimension.
        num_heads (`int`): the number of attention heads.
        dropout (`float32`): the ratio for dropout.
        is_decoder (`bool`): whether or not decoder attention.
        bias (`bool`): whether to add bias.
        scale_heads (`bool`): whether to learn scaling heads, only for Normformer.
        scale_factor (`float32`, *optional*, defaults to `2.0`):
            The position embedding scaling factor. If it works,
            self.scaling = float(self.head_dim * scale_factor)**-0.5
    r   FT       @	embed_dim	num_headsdropout
is_decoderr   scale_headsscale_factorc                    s   t    || _|| _|| _|| | _| j| | jks'J d| j d| dt| j| d | _|| _t	j
|||d| _t	j
|||d| _t	j
|||d| _t	j
|||d| _t	j|d| _|rpt	jt| jfdd| _d S d | _d S )	Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).r   r   r   Trequires_grad)r$   r   r   r   r   head_dimr}   scalingr   r	   r   k_projv_projq_projout_projDropoutattn_dropout	Parameterr&   onesc_attn)r)   r   r   r   r   r   r   r   r+   r-   r.   r     s4   


zOFAAttention.__init__tensorseq_lenr   c                 C   s    | ||| j| jdd S )z;
        Reshape tensors for multi-head attention.
        r   r@   )r~   r   r   	transposer`   )r)   r   r   r   r-   r-   r.   _shape.  s
   zOFAAttention._shapeNhidden_stateskey_value_statespast_key_valueattention_maskoutput_attentions	attn_biasc                 C   s*  |du}|  \}}	}
| || j }|r"|dur"|d }|d }nZ|r9| | |d|}| | |d|}nC|durh| | |d|}| | |d|}tj|d |gdd}tj|d |gdd}n| | |d|}| | |d|}| jr||f}|| j	 d| j
f}| ||	|j| }|j| }|j| }| d}t||dd}|  || j	 |	|fkrtd|| j	 |	|f d|   |dur||7 }|dur	|  |d|	|fkrtd	|d|	|f d|   ||| j	|	|| }||| j	 |	|}tj|dd}|r'||| j	|	|}||| j	 |	|}nd}| |}t||}|  || j	 |	| j
fkrTtd
|| j	|	| j
f d|   ||| j	|	| j
}|dd}|||	|
}| jdur|||	| j	| j
}td|| j}|||	| j}| |}|||fS )a9  
        Args:
            hidden_states (`torch.FloatTensor` of shape `(bsz, tgt_len, embed_dim)`)`: input states.
            key_value_states (`torch.FloatTensor` of shape (bsz, tgt_len, embed_dim), *optional*): key value states.
            past_key_value (`Tuple(torch.FloatTensor)`, *optional*):
                cached past key value states for fast inference.
            attention_mask (`torch.FloatTensor` of shape `(bsz, 1, tgt_len, seq_len)`, *optional*): attention mask.
            output_attentions (`bool`, *optional*): whether to output attention weights of all layers.
            attn_bias (`torch.FloatTensor` of shape `(bsz, 1, tgt_len, src_len)`, *optional*):
                the attention bias for positional information.

        Returns:
            attn_output (`torch.FloatTensor` of shape `(bsz, tgt_len, embed_dim)`): attention outputs.
            attn_weights_reshaped (`torch.FloatTensor`, *optional*): attention weights of all layers.
            past_key_value (`torch.FloatTensor`, *optional*): cached key value states for fast inference.
        Nr   r   rX   r@   rz   z$Attention weights should be of size z	, but is z!Attention mask should be of size z `attn_output` should be of size zbthd,h->bthd)rW   r   r   r   r   r   r&   r   r   r   r   r~   bmmr   
ValueErrorFsoftmaxr   reshaper   einsumr   r   )r)   r   r   r   r   r   r   is_cross_attentionr   r   r   query_states
key_statesvalue_states
proj_shaper   attn_weightsattn_weights_reshaped
attn_probsattn_outputr-   r-   r.   r%   5  s   








zOFAAttention.forward)r   FTTr   )NNNFN)r/   r0   r1   r   rI   r}   r   r   r&   r   r   r   r   r%   r4   r-   r-   r+   r.   r      sR     
r   c                
       sZ   e Zd ZdZddef fddZdd Z			dd
ejdejde	de
ej fddZ  ZS )OFAEncoderLayerz
    OFA encoder layer implementation.

    Args:
        config: configuration for OFA.
        drop_path_rate: the ratio for drop path.
    r   configc                    s8  t    |j| _t| j|j|j|jd| _t	| j| _
|jr$t	| jnd | _t|j| _t|j | _t|j| _t| j|j| _t|j| j| _|jrUt	|jnd | _t	| j| _|j| _|dkrjt|nt | _|j| _| jrt |dd}tj!t"#| j| dd| _$tj!t"#| j| dd| _%d S d S )N)r   r   r   r   r   gamma      ?Tr   )&r$   r   d_modelr   r   encoder_attention_headsattention_dropoutattn_scale_factor	self_attnr9   self_attn_layer_norm
normformerself_attn_mid_layer_normr	   r   r   r   activation_functionactivation_fnactivation_dropoutr   encoder_ffn_dimfc1fc2ffn_layer_normfinal_layer_normencoder_normalize_beforenormalize_beforer   Identity	drop_pathuse_gamma_featuregetattrr   r&   r   weight_self_attn
weight_ffnr)   r   drop_path_rater   r+   r-   r.   r     sX   
zOFAEncoderLayer.__init__c                 C      ||  | S z5
        Residual connection with drop path.
        r   r)   r*   residualr-   r-   r.   residual_connection     z#OFAEncoderLayer.residual_connectionFNr   r   r   r   c           
      C   sT  |}| j r
| |}| j||||d\}}}| jr| |}| |}| jr+| j| }| ||}| j s9| |}|}| j rC| |}| 	| 
|}| |}| jrX| |}| |}| |}| jrj| j| }| ||}| j sx| |}|jtjkrt| st| rt|jjd }tj|| |d}|f}	|r|	|f7 }	|	S )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape *(bsz, src_len, embed_dim)*
            attention_mask (`torch.FloatTensor`): attention mask of size
                *(bsz, 1, src_len, src_len)* where padding elements are indicated by very large negative values.
            output_attentions (`bool`, *optional*):
                whether to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            attn_bias (`torch.FloatTensor`): bias for positional information.

        Returns:
            outputs (`tuple(torch.FloatTensor)`):
                output hidden states of size (bsz, src_len, embed_dim), optionally with attention weights.
        )r   r   r   r   i  )r   max)r   r   r   r   r   r   r   r  r   r   r   r   r   r   r   r?   r&   float16isinfanyisnanr   r  clamp)
r)   r   r   r   r   r   r   _clamp_valueoutputsr-   r-   r.   r%     sX   












zOFAEncoderLayer.forwardr   )FN)r/   r0   r1   r   r   r   r  r&   r   r   r   r%   r4   r-   r-   r+   r.   r     s    $	r   c                       s   e Zd ZdZddef fddZdd Z										dd
ejde	ej de	ej de	ej de	e
ej  de	e de	e de	ej de	ej fddZ  ZS )OFADecoderLayerz
    OFA decoder layer implementation.

    Args:
        config: configuration for OFA.
        drop_path_rate: the ratio for drop path.
    r   r   c                    s  t    |j| _t| j|j|jd|jd| _t	j
|jd| _t|j | _t	j
|jd| _t| j| _|jr;t| jnd | _t| j|j|jd|jd| _t| j| _|jrZt| jnd | _t	| j|j| _t	|j| j| _|jrwt|jnd | _t| j| _|j| _|dkrt|nt	  | _!|j"| _"| j"rt#|dd}t	j$t%&| j| dd| _'t	j$t%&| j| dd| _(t	j$t%&| j| dd| _)d S d S )	NT)r   r   r   r   r   r   )r   r   r   r   r   r   r   )*r$   r   r   r   r   decoder_attention_headsr   r   r   r	   r   r   r   r   r   r   r9   r   r   r   
cross_attncross_attn_layer_normcross_attn_mid_layer_normr   decoder_ffn_dimr   r   r   r   decoder_normalize_beforer   r   r   r   r   r   r   r&   r   r   weight_cross_attnr   r   r+   r-   r.   r   0  s|   
zOFADecoderLayer.__init__c                 C   r   r   r   r   r-   r-   r.   r  c  r  z#OFADecoderLayer.residual_connectionNFr   r   encoder_hidden_statesencoder_attention_maskr   r   	use_cacheself_attn_biascross_attn_biasc
                 C   s  |}
| j r
| |}|dur|dd nd}| j|||||d\}}}| jr+| |}| |}| jr8| j| }| ||
}| j sF| |}d}d}|dur|}
| j rX| |}|durb|dd nd}| j	||||||	d\}}}| j
rz| 
|}| |}| jr| j| }| ||
}| j s| |}|| }|}
| j r| |}| | |}| |}| jr| |}| |}| |}| jr| j| }| ||
}| j s| |}|f}|r|||f7 }|r||f7 }|S )a}  
        Args:
            hidden_states (`torch.FloatTensor` of shape `(bsz, seq_len, embed_dim)`): input to the layer.
            attention_mask (`torch.FloatTensor` of shape `(bsz, 1, tgt_len, src_len)`):
                attention mask where padding elements are indicated by very large negative values.
            encoder_hidden_states (`torch.FloatTensor` of shape `(batch, seq_len, embed_dim)`):
                cross attention input to the layer.
            encoder_attention_mask (`torch.FloatTensor` of shape `(bsz, 1, tgt_len, src_len)`):
                encoder attention mask where padding elements are indicated by very large negative values.
            past_key_value (`Tuple(torch.FloatTensor)`): cached past key and value projection states
            output_attentions (`bool`, *optional*): whether to return the attentions tensors of all attention layers.
            use_cache (`bool`, *optional*): whether to use cache
            self_attn_bias (`torch.FloatTensor`): self attention bias for positional information.
            cross_attn_bias (`torch.FloatTensor`): cross attention bias for positional information.
        Nr@   )r   r   r   r   r   )r   r   r   r   r   r   )r   r   r   r   r   r   r   r  r  r  r  r  r   r   r   r   r   r   r   )r)   r   r   r  r  r   r   r  r  r  r   self_attn_past_key_valueself_attn_weightspresent_key_valuecross_attn_present_key_valuecross_attn_weightscross_attn_past_key_valuer  r-   r-   r.   r%   i  s   

















zOFADecoderLayer.forwardr  NNNNFFNN)r/   r0   r1   r   r   r   r  r&   r   r   r   r   r%   r4   r-   r-   r+   r.   r  '  s>    3		
r  c                   @   s.   e Zd ZdZeZdZdZdd Zd
ddZ	d	S )OFAPreTrainedModelz
    Base class OFA
    modelTc                 C   s   | j j}t|tjr"|jjjd|d |jdur |jj	  dS dS t|tj
rA|jjjd|d |jdurC|jj|j 	  dS dS dS )z;
        Weight initialization which follows BERT.
        r   r   N)r   init_std
isinstancer	   r   r   datar   r   zero_r   r   )r)   moduler   r-   r-   r.   _init_weights  s   

z OFAPreTrainedModel._init_weightsFc                 C   s   t |ttfr||_dS dS )z?
        Turn on the switch of gradient checkpointing.
        N)r%  
OFADecoder
OFAEncodergradient_checkpointing)r)   r(  valuer-   r-   r.   _set_gradient_checkpointing  s   
z.OFAPreTrainedModel._set_gradient_checkpointingN)F)
r/   r0   r1   r   r   config_classbase_model_prefixsupports_gradient_checkpointingr)  r.  r-   r-   r-   r.   r"    s    r"  c                   @   sl   e Zd ZU dZdZejed< dZej	ed< dZ
eeej  ed< dZeeej  ed< dZeej ed< dS )OFAEncoderOutputa  
    Base class for OFA's outputs.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(bsz, seq_len, hidden)`):
            Sequence of hidden-states at the output of the last layer of the model.

        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed
            or when `config.output_hidden_states=True`):

            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(bsz, seq_len, hidden)`.
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.

        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed
            or when `config.output_attentions=True`):

            Tuple of `torch.FloatTensor` (one for each layer) of shape `(bsz, num_heads, seq_len, seq_len)`.
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.

        position_embedding (`torch.FloatTensor` of shape `(bsz, seq_len, hidden)`):
            postional embeddings of the inputs.
    Nlast_hidden_statepadding_maskr   
attentionsposition_embedding)r/   r0   r1   r   r3  r&   FloatTensor__annotations__r4  r   r   r   r   r5  r6  r-   r-   r-   r.   r2    s   
 r2  aI  
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
    etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
    and behavior.

    Parameters:
        config ([`~OFAConfig`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
aW  
    Image captioning example:

    ```python
    >>> from PIL import Image
    >>> from torchvision import transforms
    >>> from transformers import OFATokenizer, OFAForConditionalGeneration

    >>> mean, std = [0.5, 0.5, 0.5], [0.5, 0.5, 0.5]
    >>> resolution = 256
    >>> patch_resize_transform = transforms.Compose([
            lambda image: image.convert("RGB"),
            transforms.Resize((resolution, resolution), interpolation=Image.BICUBIC),
            transforms.ToTensor(),
            transforms.Normalize(mean=mean, std=std)
        ])

    >>> model = OFAForConditionalGeneration.from_pretrained(ckpt_dir)
    >>> tokenizer = OFATokenizer.from_pretrained(ckpt_dir)

    >>> txt = " what is the description of the image?"
    >>> inputs = tokenizer([txt], max_length=1024, return_tensors="pt")["input_ids"]
    >>> img = Image.open(path_to_image)
    >>> patch_img = patch_resize_transform(img).unsqueeze(0)

    >>> gen = model.generate(inputs, patch_img=patch_img, num_beams=4)
    >>> print(tokenizer.decode(gen, skip_special_tokens=True, clean_up_tokenization_spaces=False))
    ```
a  
    Args:
        input_ids (`torch.LongTensor` of shape `(bsz, seq_len)`):
            indices of input sequence tokens in the vocabular, and padding will be ignored by default;

            indices can be obtained using [`~OFATokenizer`].

        patch_images (`torch.FloatTensor` of shape `(bsz, 3, height, width)`):
            the resized image, which are transformed by the default operations.
        patch_images_2 (`torch.FloatTensor` of shape `(bsz, 3, height, width)`):
            the second (if it exists) image.
        patch_masks (`torch.BoolTensor`): the patches to be masked.
        token_embeddings (`torch.FloatTensor` of shape `(bsz, seq_len, embed_dim)`): token embeddings.
        sample_patch_num (`int`): the number of patches to sample.
        decoder_input_ids (`torch.LongTensor` of shape `(bsz, seq_len)`): indices of the sequence in the vocabulary.
        code_masks (`torch.Tensor` of shape `(bsz, seq_len)`): masks only for code generation.
        attention_mask (`torch.Tensor` of shape `(bsz, seq_len)`): attention mask for decoding.
        encoder_outputs (`OFAEncoderOutput`):
            encoder outputs with hidden states, positional embeddings, and padding masks.
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
            shape `(bsz, num_heads, tgt_len, head_size)`) and 2 additional tensors of
            shape `(bsz, num_heads, src_len, head_size)`.
        use_cache (`bool`): whether to use cache for faster inference.
        output_attentions (`bool`): whether to output attention weights.
        output_hidden_states (`bool`): whether to output hidden states.
        return_dict (`bool`): unused. Keep it for generation only.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
c                       s  e Zd ZdZ	d%dedeej f fddZdd Z	d	d
 Z
dd Zdd Zdd Z						d&deej deej deej deej deej deej fddZdd Z								d'deej deej deej ded ed!eej d"ee fd#d$Z  ZS )(r+  z
    OFA encoder consisting of layers of [`OFAEncoderLayer`].

    Args:
        config: OFAConfig
        embed_tokens (`nn.Embedding`, *optional*): output embedding
    Nr   embed_tokensc           	         s:  t    t j_ j_ j} j_ j	_
 jr$t|nd_ j_t ddr7t|_nd _|d urB|_n
t j|j_ jrd jr[td|d d_ntd|d d_nd _ jrވ jrttttd j }| j _!t"j!j#|_$nX j%dkrt&g d	 j'd
_!nB j%dkrt&g d j'd
_!n2 j%dkrt&g d j'd
_!n" j%dkrt&g d j'd
_!n j%dkrt&g d j'd
_!nt(t"d|_$ js j)rt*d+ j) t,- j)}j!.|  j/rt|_/nd _/tj
d |_0 jrt j1d d |_2 js&t|_3 jr/t|_4t5|j  j6 d _7 jrD j8sRt"||_9t"||_:jdkr`t;jd_<nt=g _<dd t,>d j? j@D j<A fddtB j@D  tCj<_D jErt|_Fnd _F jG_Gd jG d tH jG} jI_I jIrdn j@}t=fddtB|D _J jr j1_1d j1 d d j1 d  d tK j1}t=fddtB|D _LMd|  jrt|_nd _Md|  j8_8d_NO   j_d S ) Nr   layernorm_embeddingFr@   r   r   )r   r   r   r   resnet18)r@   r@   r@   r   resnet34)rY         resnet50	resnet101)rY   r>     	resnet152)rY      $   r    zload resnet {}r   r   r   c                 S      g | ]}|  qS r-   item.0r*   r-   r-   r.   
<listcomp>      z'OFAEncoder.__init__.<locals>.<listcomp>r   c                       g | ]
}t  | d qS r<  )r   rJ  r   r   dprr-   r.   rK        c                       g | ]
}t  jd dqS T)r   r   num_attention_headsrJ  r	  r)   token_num_rel_disr-   r.   rK        rY   c                       g | ]
}t  jd dqS rT  rU  rW  image_num_rel_disr)   r-   r.   rK        image_rp_buckettoken_rp_bucket)Pr$   r   r	   r   r   encoder_layerdropr   rp   r   max_position_embeddingsmax_source_positionsscale_embeddingrH   sqrtembed_scaler   rV  r   r9   r:  r9  r   
vocab_sizeadd_type_embeddinguse_image_featuretype_embedding
use_ofasysr   r   r   r   vit_typevit_drop_path_rateembed_imagesr   width
image_projresnet_typer   resnet_drop_path_rateNotImplementedErrorresnet_model_pathprintformatr&   loadload_state_dictpatch_layernorm_embeddingembed_positionsimage_bucket_sizeembed_image_positionspos_lnimage_pos_lnr}   r   pos_scalingentangle_position_embeddingpos_q_linearpos_k_linearr   layers
ModuleListlinspaceencoder_drop_path_rateencoder_layersextendrangerl   
num_layersr   
layer_normtoken_bucket_sizerT   share_attn_biastoken_rel_pos_table_listrj   image_rel_pos_table_listregister_bufferr,  	post_init)	r)   r   r9  r   vit_backboneresnet_state_dictr`  num_rel_pos_tablesr_  r+   r   rQ  r]  r)   rY  r.   r   o  s  





















zOFAEncoder.__init__c                 C      | j S )z+
        Get the embedding weight.
        r9  r)   r-   r-   r.   get_input_embeddings
     zOFAEncoder.get_input_embeddingsc                 C   
   || _ dS )zD
        Set the weight of embedding with the given tensor.
        Nr  r)   r-  r-   r-   r.   set_input_embeddings     
zOFAEncoder.set_input_embeddingsc                 C   sf   | d}| jd|d|f }t|| j| j}|d| dddd}|g d}|	 S )N
        Get the relative positional bias of the text, for attention.
        r   Nr   rX   )r   rY   r   r@   )
rW   r`  r   	embeddingr  r   	unsqueezerm   r_   r`   r)   r*   idxr   	rp_bucketvaluesr-   r-   r.   get_rel_pos_bias  s   

zOFAEncoder.get_rel_pos_biasc                 C   s   |j \}}| jd}| jd|||d|dddddf |||d|dddddf |||}t|| j| j	}|
dddd}|S )O
        Get the relative positional bias of the image, for attention.
        r   r   Nr@   rY   )rs   r_  rW   r  rm   gatherr   r  r  r   r_   )r)   image_position_idsr  r   r   rp_bucket_sizer  r  r-   r-   r.   get_image_rel_pos_bias#  s*   

z!OFAEncoder.get_image_rel_pos_biasc              
      s  |  |}|jdd \}}||  ||d f }t|d||t|d| j	  d }|
d|}|dddf |d }	|ddd}dur fddt|dD }
t|
|}
|d|
ddd|d} |d|
}|	d|
}	| jjd	 d }| jjd	 }| jjr |krt|d||t|d| jj	  d }||}| |}|d||ddd
dd}tj|||fdd}|ddd
dd d}||ddd}n| |	}| ||	|fS )a_  
        Get the basic information of the resized image.

        Args:
            patch_images (`torch.FloatTensor` of shape `(bsz, 3, height, width)`): the resized image.
            sample_patch_num (`int`):
                the number of patches to sample. If it is equal to -1, no sampling will be performed.
            device: GPU device.

        Returns:
            image_embed (`torch.FloatTensor` of shape `(bsz, h * w, hidden)`): the output of the visual encoder.
            image_num_patches (`int`, equal to `h * w`): the number of patches.
            image_padding_mask (`torch.BooleanTensor` of shape `(bsz, h*w)`): image padding mask.
            image_position_ids (`torch.LongTensor` of shape `(bsz, h*w)`): image position ids.
            image_pos_embed (`torch.FloatTensor` of shape (bsz, h*w, hidden)): the positional embedding.
        r  Nr   r   rX   r@   c                    s   g | ]}t jt d qS ))k)randomsampler  rW  image_num_patchessample_patch_numr-   r.   rK  U  s    z4OFAEncoder.get_patch_images_info.<locals>.<listcomp>   rY   bilinear)rW   mode)rn  rs   rr   rW   r   r&   rA   r  rm   r{  r~   r   r^   r   r  
LongTensorr  r   orig_patch_image_sizeinterpolate_positionr|  r   r_   r   interpolate)r)   patch_imagesr  r(   image_embedhwimage_padding_maskimage_position_idxr  patch_ordersorig_num_patchesorig_hwold_image_position_idsold_image_pos_embedimage_pos_embedr-   r  r.   get_patch_images_info6  sx   




z OFAEncoder.get_patch_images_infor  image_embed_2token_embedding	pos_embedr  image_pos_embed_2c                 C   s  |du r	|  |}| j|  }}	| jr|dur||7 }| jdur0|| || dd 7 }| jdur:| |}| |}|dur| |}| j|  }
}| jrZ|durZ|
|7 }
| jduro|
| |	|
 dd 7 }
| j
dury| 
|
}
| |
}
tj|
|gdd}tj||	gdd}	|dur| jdusJ | |}| j|  }}| jr|dur||7 }| jdur|| |j| dd dd7 }| j
dur| 
|}| |}| jdur| |}tj||gdd}tj||	gdd}	||	fS )a6  
        Generate embeddings of both the image and the text.
        Actually since OFA unifies both unimodal and multimodal data,
        image inputs are optional.

        Args:
            input_ids (`torch.LongTensor` of shape `(bsz, seq_len)`): indices of the tokens in the vocabulary.
            image_embed (`torch.FloatTensor` of shape `(bsz, h*w, embed_dim)`, *optional*): image embeddings.
            image_embed_2 (`torch.FloatTensor` of shape `(bsz, h*w, embed_dim)`, *optional*):
                image embeddings of the second image (if it exists).
            token_embedding (`torch.FloatTensor` of shape `(bsz, seq_len, embed_dim)`, *optional*):
                input token embeddings to replace the embeddings of input ids.
            image_pos_embed (`torch.FloatTensor` of shape `(bsz, h*w, embed_dim)`, *optional*):
                positional embeddings of the image.
            image_pos_embed_2 (`torch.FloatTensor` of shape `(bsz, h*w, embed_dim)`, *optional*):
                positional embeddings of the second image.

        Returns:
            x (`torch.FloatTensor` of shape `(bsz, h*w+seq_len, embed_dim)`): embeddings of the input.
            embed (`torch.FloatTensor` of shape `(bsz, h*w+seq_len, embed_dim)`):
                embeddings without adding positional and type embeddings.
        Nr@   r   rz   )
fill_value)r9  rf  r  rj  rr   rW   r:  r   rp  new_onesry  r&   r   new_fullquant_noise)r)   ro   r  r  r  r  r  r  r*   embedimage_x	image_x_2r-   r-   r.   forward_embeddingv  sT    

 














zOFAEncoder.forward_embeddingc                 C   s   d|vrd}n|d  d|}d|vrd}n|d  d|}d|vr%d}n|d  d|}d|vr4d}n|d }d}t|dkrRt|D ]\}}	||	 d|f7 }qDd|vrYd}
n|d }
t||||
|d	S )
a  
        Reorder encoder output according to *new_order*.

        Args:
            encoder_out: output from the ``forward()`` method
            new_order (LongTensor): desired order

        Returns:
            *encoder_out* rearranged according to *new_order*
        r3  Nr   r4  r6  r   r-   r5  r3  r4  r   r5  r6  )index_selectrl   r   r2  )r)   encoder_out	new_ordernew_encoder_outnew_encoder_padding_masknew_position_embeddingsnew_encoer_statesencoder_statesr  stater5  r-   r-   r.   reorder_encoder_out  sH   zOFAEncoder.reorder_encoder_outFr  patch_images_2patch_masksr   output_hidden_statestoken_embeddingsr  c	           #   
      s  d}	d}
d}d}|dur  |||j\}	}}}}d|| < |dur4  |||j\}
}}}}d|| < | j}|durGtj||gdd}|durTtj||gdd}| } t|} 	||	|
||||\}}|rz|d|
d|  } jr|durtj||gdd}|durtj||gdd}n) |}|dur |}tj||gdd}|dur |}tj||gdd} fdd}||}|rt||jd}|rd	nd}|rd	nd}t jD ]\}}|r||f7 }| } jrd
n|} |dddd|d d|d df   || 7  < |durU|ddddd|d|f   || 7  < |dddd||| ||| f   || 7  < n-|dur|ddddd|d|d d|d|d f   || 7  < |d|d|d}|||r|nd||d}!|!d
 }|r|!d }"||"f }q|r||f7 } jdur |}t|||||dS )a  
        Args:
            input_ids (`torch.LongTensor` of shape `(bsz, seq_len)`):
                indices of input sequence tokens in the vocabular, and padding will be ignored by default;

                indices can be obtained using [`~OFATokenizer`].

            patch_images (`torch.FloatTensor` of shape `(bsz, 3, height, width)`):
                the resized image, which are transformed by the default operations.
            patch_images_2 (`torch.FloatTensor` of shape `(bsz, 3, height, width)`):
                the second (if it exists) image.
            patch_masks (`torch.BoolTensor`): the patches to be masked.
            output_attentions (`bool`): whether to return all attention weights,
            output_hidden_states (`bool`): whether to return all hidden states.
            token_embeddings (`torch.FloatTensor` of shape `(bsz, seq_len, embed_dim)`): token embeddings.
            sample_patch_num (`int`): the number of patches to sample.

        Returns:
            [`OFAEncoderOutput`]:
                last_hidden_state (`torch.FloatTensor` of shape `(bsz, seq_len, embed_dim)`):
                    the states of the last layer.
                padding_mask (`torch.BoolTensor` of shape `(bsz, seq_len)`):
                    the padding mask of the source context.
                hidden_states (`torch.FloatTensor` of shape `(bsz, seq_len, embed_dim)`):
                    the states of all layers including the embeddings.
                attentions (`torch.FloatTensor` of shape `(bsz, num_heads, seq_len, seq_len)`):
                    the attention weights of all layers.
                position_embedding (`torch.FloatTensor` of shape `(bsz, seq_len, embed_dim)`):
                    positional embeddings of the input image and tokens.
        NTr   rz   rX   c                    s   |  d|  d}} jr js@ | || jddd j } | || jddd}t	
||dd}|S t	j| j||| j| jd}|S )Nr   r   rX   r@   rY   r?   r(   )rW   rk  r  r  r~   rV  r   r  r  r&   matmulra   r?   r(   )r  
batch_size
seq_lengthpos_qpos_kabs_pos_biasr  r-   r.   build_abs_pos_biasN  s2   

	z.OFAEncoder.forward.<locals>.build_abs_pos_biasr>   r-   r   )r   r   r  )r  r(   eqr   r&   r   r  rz  rn   r  r  type_asrk  r}  r~  r   r?   r   r  rt   r  rW   r  r  r   r  r2  )#r)   ro   r  r  r  r   r  r  r  r  r  r  r  r  r  r  image_num_patches_2image_padding_mask_2image_position_ids_2encoder_padding_maskhas_padsr  r*   encoder_embeddingr  r  r   r  all_attentionsr  layerr  real_idxhidden_outputs	attentionr-   r  r.   r%     s   )







 




@



zOFAEncoder.forwardr"   )NNNNNNr!  )r/   r0   r1   r   r   r   r	   r   r   r  r  r  r  r  r&   r   r  r  r   rI   r%   r4   r-   r-   r+   r.   r+  f  sr    
 B
M7	r+  c                       sr  e Zd ZdZ		d.dedeej f fddZdd Z	d	d
 Z
dd Zd/ddZdd Zdd Zdd Zdd Z	d0deeeeeeee  f  f dedeeeef  fddZ	d0deeeeeeee  f  f dedeeeef  fddZdeej d efd!d"Z										d1d#ejd$ejd%ejd&ejd'eej d(ejdeej d)ed*ed+efd,d-Z  ZS )2r*  z
    OFA decoder consisting of layers of [`OFADecoderLayer`]

    Args:
        config: OFAConfig
        embed_tokens (`nn.Embedding`, *optional*): output embedding
    Nr   r9  c                    s  t    t j_ j_ j_ j_	 j
r"t jnd_td_ j_ j_ j_ j_|d urC|_nt j jj_ j_ j_t fddt jD _  j!rpt"j_!nd _! jr j#rtdjd d_$nd _$ j%d _&tj	d j_' jst j(d d j_) jst"j_*t"j_+t,jj  j- d	 _. jrȈ j/st0jj_1t0jj_2t0jj_3t0jj_4 j5rt"j_5nd _5jd
krt6jd_ ntg _ dd t7d j8 jD j 9 fddt jD  t:j _; j<r=t"j_=nd _=d _>|_?j?d u rQ@   jA_Ad jA d tB jA} jC_C jCrkdn j}tfddt|D _D jEr js j(_(d j( d d j( d  d tF j(}tGj&HdIj&j&tGj&Hd j(  d }tJtKdg|Ldg}tJ|tKdgd g}Md| tfddt|D _NMd| Md|  j/_/d_OP  d S )Nr   r   c                    s   g | ]}t  qS r-   r  rW  )r   r-   r.   rK    s    z'OFADecoder.__init__.<locals>.<listcomp>r   r   rD  r@   r   r   r   c                 S   rF  r-   rG  rI  r-   r-   r.   rK    rL  c                    rM  rN  r  rO  rP  r-   r.   rK    rR  c                    rS  rT  rU  rW  rX  r-   r.   rK    rZ  rY   rX   r    i   r  c                    r[  rT  rU  rW  r\  r-   r.   rK    r^  r_  r`  F)Qr$   r   r	   r   r   decoder_layerdroprp   r   rb  max_target_positionsrd  rH   re  r   rf  r&   r   _future_mask share_decoder_input_output_embedshare_input_output_embedr  rV  rk  disable_entangler9  r   rg  r   output_embed_dimr  r  decoder_layersr  r:  r9   rh  rj  code_image_sizewindow_sizerz  r{  r|  r}  r~  r}   r   r  r  r   self_pos_q_linearself_pos_k_linearcross_pos_q_linearcross_pos_k_linearcode_layernorm_embeddingr   r  decoder_drop_path_rater  rl   r  r  r  adaptive_softmaxoutput_projectionbuild_output_projectionr  rT   r  r  ri  rj   rA   r  rm   r   r   r~   r  r  r,  r  )r)   r   r9  r  r`  r  r_  r  r+   r  r.   r     s   







zOFADecoder.__init__c                 C   sr   | j rtj| jjjd | jjjd dd| _| jj| j_d S tj| j|jdd| _tj	j
| jjd| jd d d S )Nr   r   Fr   r   r   )r  r	   r   r9  r   rs   r  r  rg  r   r   )r)   r   r-   r-   r.   r	  -  s   

z"OFADecoder.build_output_projectionc                 C   sJ   | d}| jd|d|f }t|| j| j}|g d}| S )r  r   N)r@   r   r   )rW   r`  r   r  r  r   r_   r`   r  r-   r-   r.   r  =  s   

zOFADecoder.get_rel_pos_biasc                 C   sT   | d}| jd| }| j| dd|f }t|| j| j}|ddd}|S )r  r   Nr@   r   )rW   r  r_  r   r  r  r   r_   )r)   r*   r  r   r  r  r  r-   r-   r.   r  I  s   


z!OFADecoder.get_image_rel_pos_biasFc           
      C   sX  | d}| d}| js|r| |n| |}|durg| d}| jr(| jsW| |||| jddd| j	 }| 
|||| jddd}t||dd}	|	S tj|| j|||j|jd}	|	S | jrm| js| |||| jddd| j	 }| |||| jddd}t||dd}	|	S tj|| j|||j|jd}	|	S )aB  
        Get the positional information.

        Args:
            tgt_pos_embed (`torch.FloatTensor` of shape `(bsz, tgt_len, embed_dim)`):
                the target-side positional embeddings.
            src_pos_embed (`torch.FloatTensor` of shape `(bsz, src_len, embed_dim)`, *optional*):
                the source-side positional embeddings.
            use_image (`bool`): whether to use image.

        Returns:
            abs_pos_bias (`torch.FloatTensor` of shape `(bsz, src_len, tgt_len, src_len)`):
                absolute positional bias for attention.
        r   r   NrX   r@   rY   r  )rW   rk  r~  r}  r  r  r~   rV  r   r  r  r&   r  ra   r?   r(   r  r  )
r)   tgt_pos_embedsrc_pos_embed	use_imager  r   r   r  r  r  r-   r-   r.   get_pos_infoW  sv   







zOFADecoder.get_pos_infoc                 C   r  )z*
        Get the input embeddings
        r  r  r-   r-   r.   r    r  zOFADecoder.get_input_embeddingsc                 C   r  )zJ
        Set the weights of the embeddings with the given tensor.
        Nr  r  r-   r-   r.   r    r  zOFADecoder.set_input_embeddingsc                 C   sX   d}|d dkrt |||d| j}|dur*t|||d d}|du r&|n|| }|S )z~
        Create causal mask for unidirectional decoding.
        [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
        NrX   r   )ry   )r   )r   r   r(   r   )r)   r   input_shaper?   ry   combined_attention_maskexpanded_attn_maskr-   r-   r.   _prepare_decoder_attention_mask  s"   
z*OFADecoder._prepare_decoder_attention_maskc                 C   s   | j du r| jS | jS )z/Maximum output length supported by the decoder.N)rz  r  r  r-   r-   r.   max_positions  s   
zOFADecoder.max_positions
net_output	log_probsr  c                 C      |  |||S @Get normalized probabilities (or log probs) from a net's output.get_normalized_probs_scriptabler)   r  r  r  r-   r-   r.   get_normalized_probs     zOFADecoder.get_normalized_probsc                 C   s   t | dr-| jdur-|durd|v sJ |d }nd}| jj|d |d}|s+| S |S |d }|r:tj|ddS tj|ddS )r  r  Ntargetr   )r  rX   rz   )hasattrr  get_log_probexp_r   log_softmaxr   )r)   r  r  r  r  outlogitsr-   r-   r.   r    s    

z*OFADecoder.get_normalized_probs_scriptablepast_key_valuesr  c                 C   sZ   |}g }|dur+|D ] }g }|D ]}|du rd}n| d|}|| q|| q
|S )a	  Main entry point for reordering the incremental state.

        Due to limitations in TorchScript, we call this function in
        :class:`fairseq.sequence_generator.SequenceGenerator` instead of
        calling :func:`reorder_incremental_state` directly.
        Nr   )r  append)r)   r$  r  input_buffernew_past_key_valuesinput_buffer_knew_input_buffer_kinputr-   r-   r.   #reorder_incremental_state_scripting  s   z.OFADecoder.reorder_incremental_state_scriptingro   r   r  r  
code_masksr  r  r   r  c           &      C   s<  |	dur|	n| j j}	|
dur|
n| j j}
|dur|n| j j}|durKt|dkrK|d d  }|d |d d }}tj||jd	||g
 }n	|j\}}t|}| |}|dur|t|r|| jd|d d	||}| || ||< | j|dd}|durt|r| j|dd}|| ||< | j||d	}|durt|r| j||dd
}|| ||< |jdg| dd R  }| }|durt|dkr|ddddf }|ddddddf }|ddddddf }| j| | }| jr| js||7 }| jdurJ|du s | r | js&| |}n$|dur6| r6| |}n| ||  || < | || ||< | |}|durdt|dkrd|d d jd nd}|j|j}}| ||||}|
rzdnd}|	rdnd}|	r|durdnd}|rdnd}t| j D ]\}} |
r||f7 }|durt|dkr|| nd}!| }"| j!rdn|}#|du s| s|"| "||#d7 }"n5|dur| r|"| #||#d7 }"n|"|   | "||#d7  < |"|  | #||#d7  < |"jdg|" dd R  }"|!dur8t|dkr8|"ddddddf }"| |||||!|	||"|d	}$|$d }|rX||$|	rSdnd f7 }|	rn||$d f7 }|durn||$d f7 }q|
rx||f7 }|r}|nd}%| j$dur| $|}| j%dur| %|}t&||%|||dS )a  
        Args:
            input_ids (`torch.LongTensor` of shape `(bsz, seq_len)`): indices of the sequence in the vocabulary.
            attention_mask (`torch.Tensor` of shape `(bsz, seq_len)`): mask to avoid attention on padding tokens.
            encoder_hidden_states (`torch.FloatTensor` of shape `(bsz, seq_len, hidden)`): the last hidden state of the encoder.
            encoder_attention_mask (`torch.Tensor` of shape `(bsz, seq_len)`): the padding mask of the source side.
            code_masks (`torch.Tensor` of shape `(bsz, seq_len)`): masks only for code generation.
            src_pos_embed (`torch.FloatTensor` of shape `(bsz, seq_len, hidden)`): the positional embeddings of the source side.
            past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed):
                Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
                shape `(bsz, num_heads, tgt_len, head_size)`) and 2 additional tensors of
                shape `(bsz, num_heads, src_len, head_size)`.
            use_cache (`bool`): whether to use cache for faster inference.
            output_attentions (`bool`): whether to output attention weights.
            output_hidden_states (`bool`): whether to output hidden states.

        Returns:
            BaseModelOutputWithPastAndCrossAttentions or a plain tuple:
                last_hidden_state (`torch.FloatTensor` of shape `(bsz, seq_len, hidden)`): the last hidden states.
                past_key_values (`tuple(tuple(torch.FloatTensor)): past keys and values for faster inference.
                hidden_states (`tuple(torch.FloatTensor)`): hidden states of all layers.
                attentions (`tuple(torch.FloatTensor)): self attention weights of all layers.
                cross_attentions (`tuple(torch.FloatTensor)): cross attention weights of all layers.
        Nr   r  r   rk   F)r  T)r  )r  r  rX   r@   r-   )r   r  r  r   r   r  r  r  rY   )r3  r$  r   r5  cross_attentions)'r   r   r  r  rl   rW   r&   rA   r(   rm   r`   rs   rn   rz  r  r  r  r|  r  r   rt   rf  r9  r  r  r:  r  allr   r?   r  r   r  r  r  r  r  r  r   )&r)   ro   r   r  r  r,  r  r$  r  r   r  rW   r   r   token_position_idxr
  r  self_abs_pos_biasself_image_abs_pos_biascross_abs_pos_biascross_image_abs_pos_biasall_prev_output_tokensr*   r   ry   rs   r?   all_hidden_statesall_self_attnsall_cross_attentionsnext_decoder_cacher  r  r   r  r  layer_outputs
next_cacher-   r-   r.   r%     sR  &









zOFADecoder.forward)NNNFr"   )
NNNNNNNFFF)r/   r0   r1   r   r   r   r	   r   r   r	  r  r  r  r  r  r  r  r   r   r   strr   r   r  r  r&   r+  r%   r4   r-   r-   r+   r.   r*    s    
 	
=



	
r*  zQThe bare OFA Model outputting raw hidden-states without any specific head on top.c                       s  e Zd ZdZdef fddZdd Zdd Zd	d
 Zdd Z	e
eeeeeeddd Z	d.deeeeeeee  f  f dedeeeef  fddZ	d.deeeeeeee  f  f dedeeeef  fddZ															d/ddZ						d0ddZdejfddZ	d.d ejd!ee fd"d#Ze d$d% Z!e 	&			d1d'ej"d(e#d)ed*eej" d+ee$ f
d,d-Z%  Z&S )2OFAModelz
    The OFA model built with an encoder and a decoder only, without any classification head.

    Args:
        config (OFAConfig): OFA configuration.
    r   c                    s   t  | t|dd| _|j|j| _}t||j	| j}t
||| _t||| _|j| _t|dds<t|j	|j| _|jrKt|jtg  | _|   d S )Nr  Fexclude_mlpT)r$   r   r   r  rp   rg  r   r	   r   r   r+  encoderr*  decoderrk  r   mlp_dimmlp_headtemperature_init_valuer   r&   r   tempr  )r)   r   kwargsrg  sharedr+   r-   r.   r     s   zOFAModel.__init__c                 C   
   | j  S )z,
        Retrieve input embeddings.
        )r?  r  r  r-   r-   r.   r    r  zOFAModel.get_input_embeddingsc                 C   s   |}|| j _|| j_dS )z1
        Set values for input embeddings
        N)r?  r9  r@  )r)   r-  rF  r-   r-   r.   r    s   zOFAModel.set_input_embeddingsc                 C   r  )z&
        Retrieve the encoder
        )r?  r  r-   r-   r.   get_encoder  r  zOFAModel.get_encoderc                 C   r  )z&
        Retrieve the decoder
        )r@  r  r-   r-   r.   get_decoder  r  zOFAModel.get_decoder)processor_class
checkpointoutput_typer/  c                 C   rG  )z(Maximum length supported by the decoder.)r@  r  r  r-   r-   r.   max_decoder_positions  s   

zOFAModel.max_decoder_positionsNr  r  r  c                 C   r  r  r  r  r-   r-   r.   r    r  zOFAModel.get_normalized_probsc                 C   sP   t | dr| j|||S t|r&| }|rtj|ddS tj|ddS t	)zHScriptable helper function for get_normalized_probs in ~BaseFairseqModelr@  rX   rz   )
r  r@  r  r&   	is_tensorr}   r   r!  r   rs  )r)   r  r  r  r#  r-   r-   r.   r    s   


z(OFAModel.get_normalized_probs_scriptableFc                 C   s   |r|n| j j}|r|n| j j}|dur|n| j j}|
du r+| j||||||||d}
|| j j r:|| j}	|
j	}t
|
j|j|jd }|
j}| j||	||||||||d
}t|j	|j|j|j|j|
j	|
j|
jdS )a  
        Args:
            input_ids (`torch.LongTensor` of shape `(bsz, seq_len)`):
                indices of input sequence tokens in the vocabular, and padding will be ignored by default;

                indices can be obtained using [`~OFATokenizer`].

            patch_images (`torch.FloatTensor` of shape `(bsz, 3, height, width)`):
                the resized image, which are transformed by the default operations.
            patch_images_2 (`torch.FloatTensor` of shape `(bsz, 3, height, width)`):
                the second (if it exists) image.
            patch_masks (`torch.BoolTensor`): the patches to be masked.
            token_embeddings (`torch.FloatTensor` of shape `(bsz, seq_len, embed_dim)`): token embeddings.
            sample_patch_num (`int`): the number of patches to sample.
            decoder_input_ids (`torch.LongTensor` of shape `(bsz, seq_len)`): indices of the sequence in the vocabulary.
            code_masks (`torch.Tensor` of shape `(bsz, seq_len)`): masks only for code generation.
            attention_mask (`torch.Tensor` of shape `(bsz, seq_len)`): attention mask for decoding.
            encoder_outputs (`OFAEncoderOutput`):
                encoder outputs with hidden states, positional embeddings, and padding masks.
            past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed):
                Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
                shape `(bsz, num_heads, tgt_len, head_size)`) and 2 additional tensors of
                shape `(bsz, num_heads, src_len, head_size)`.
            use_cache (`bool`): whether to use cache for faster inference.
            output_attentions (`bool`): whether to output attention weights.
            output_hidden_states (`bool`): whether to output hidden states.
            return_dict (`bool`): unused. Keep it for generation only.

        Returns:
            Seq2SeqModelOutput:
                last_hidden_state (`torch.FloatTensor` of shape `(bsz, seq_len, hidden)`): the last decoder hidden states.
                past_key_values (`tuple(tuple(torch.FloatTensor)): past keys and values for faster inference.
                decoder_hidden_states (`tuple(torch.FloatTensor)`): the decoder hidden states of all layers.
                decoder_attentions (`tuple(torch.FloatTensor)): the decoder self attention weights of all layers.
                cross_attentions (`tuple(torch.FloatTensor)): cross attention weights of all layers.
                encoder_last_hidden_state (`torch.FloatTensor` of shape `(bsz, seq_len, embed_dim)`):
                    the encoder last hidden state.
                encoder_hidden_states (`torch.FloatTensor` of shape `(bsz, seq_len, embed_dim)`):
                    the encoder states of all layers including the embeddings.
                encoder_attentions (`torch.FloatTensor` of shape `(bsz, num_heads, seq_len, seq_len)`):
                    the encoder attention weights of all layers.
        N)ro   r  r  r  r   r  r  r  rX   )
ro   r   r  r  r,  r  r$  r  r   r  )r#  r$  decoder_hidden_statesdecoder_attentionsr-  encoder_last_hidden_stater  encoder_attentions)r   r   r  r  r?  r  rp   r  r   r3  r   r4  r?   rs   r6  r@  r   r$  r   r5  r-  )r)   ro   r  r  r  r  r  decoder_input_idsr,  r   encoder_outputsr$  r  r   r  return_dictr  r  r  decoder_outputsr-   r-   r.   r%   $  s^   ;zOFAModel.forwardc                 K   sF   | |j}|d ur|d d dd f }d d d d d d ||||||dS )NrX   )ro   r  r  r  r  r  r   rT  r$  rS  r,  r  )rr   rs   )r)   rS  pastr   r,  r  rT  rE  r-   r-   r.   prepare_inputs_for_generation  s    	z&OFAModel.prepare_inputs_for_generationlabelsc                 C   s   t || jj| jjS r"   )rw   r   rp   rq   )r)   rY  r-   r-   r.   %prepare_decoder_input_ids_from_labels  s   
z.OFAModel.prepare_decoder_input_ids_from_labelsinputs_tensormodel_input_namec                    s|   |   }g d  fdd| D }|dd u r"tdg|d< |d ur(|n| j}|||< |di ||d< d |d< |S )	N)decoder_r  r  r   c                    s,   i | ]\ }t  fd dD s |qS )c                 3   s    | ]}  |V  qd S r"   )
startswith)rJ  r   argumentr-   r.   	<genexpr>  s    zUOFAModel._prepare_encoder_decoder_kwargs_for_generation.<locals>.<dictcomp>.<genexpr>)r  )rJ  r-  irrelevant_prefixr_  r.   
<dictcomp>  s    zKOFAModel._prepare_encoder_decoder_kwargs_for_generation.<locals>.<dictcomp>r  TrT  r   r-   )rH  itemsgetr&   r   main_input_name)r)   r[  model_kwargsr\  r?  encoder_kwargsr-   rb  r.   ._prepare_encoder_decoder_kwargs_for_generation  s   

z7OFAModel._prepare_encoder_decoder_kwargs_for_generationc                    s.   d}| D ]}|t  fdd|D f7 }q|S )Nr-   c                 3   s    | ]	}| d  V  qdS )r   N)r  )rJ  
past_statebeam_idxr-   r.   ra    s
    

z*OFAModel._reorder_cache.<locals>.<genexpr>)tuple)rW  rm  reordered_past
layer_pastr-   rl  r.   _reorder_cache  s   zOFAModel._reorder_cacher   ro   expand_sizeis_encoder_decoderr   rT  c                 K   s   t | jd ddd|d| j}| d|} d|v r-|d }|d||d< |d ur9|d||d< |rq|d u rCtd|j	d||j	j|d< |j
d||j
j|d< |jd||jj|d	< ||d
< | |fS )Nr   rX   r   token_type_idsr   zMIf `is_encoder_decoder` is True, make sure that `encoder_outputs` is defined.r3  r6  r4  rT  )r&   rA   rs   r~   repeatr   r(   r  r   r3  r6  r4  )ro   rr  rs  r   rT  rh  expanded_return_idxrt  r-   r-   r.   _expand_inputs_for_generation  sb   
z&OFAModel._expand_inputs_for_generationr"   )NNNNNNNNNNNFFFF)NNNNFN)r   FNN)'r/   r0   r1   r   r   r   r  r  rH  rI  r   OFA_INPUTS_DOCSTRINGr   _TOKENIZER_FOR_DOC_CHECKPOINT_FOR_DOCr   _CONFIG_FOR_DOCrM  r   r   r   r   r<  r   r   r  r  r%   rX  r&   rZ  rj  staticmethodrq  r  rI   r   rw  r4   r-   r-   r+   r.   r=    s    


o


r=  )r5   TF)r   r"   r;  )T)Zr   rH   r  dataclassesr   typingr   r   r   r   r&   	packagingr   r   r	   torch.nnr
   r   transformers.activationsr   transformers.file_utilsr   r   r   r   transformers.modeling_outputsr   r   r   transformers.modeling_utilsr   transformers.utilsr   configuration_ofar   generater   resnetr   utils.utilsr   vitr   r   r   r   
get_loggerloggerrz  r{  ry  parse__version__rZ   r[   DEFAULT_MAX_SOURCE_POSITIONSDEFAULT_MAX_TARGET_POSITIONSrI   DEFAULT_MIN_PARAMS_TO_WRAP!OFA_PRETRAINED_MODEL_ARCHIVE_LISTapex.normalizationr!   _FusedLayerNormr8   ImportErrorr9   rT   rj   rn   rw   Sizer?   r   r   r   r   r  r   Moduler   r   r  r"  r2  OFA_START_DOCSTRINGOFA_GENERATION_EXAMPLErx  r+  r*  r=  r-   r-   r-   r.   <module>   s   











 2x /!"    :    !